Suggested citation: Madaga, Lavinia, Jordan Chamberlin, Bisrat Gebrekidan, Robert Hijmans, Nicholaus Kuboja, and Maxwell Mkondiwa. 2024. “Predictive mapping of wholesale grain prices for rural areas in Tanzania.” https://github.com/EiA2030-ex-ante/Tanzania-Price-Data

Introduction

We are interested in estimating the prices of agricultural food commodities across space and time, on the basis of prices as observed at some market locations. Here we explore methods to model these prices, using monthly data from across Tanzania over the period May 2021-July 2024.

This document describes the process of fitting a Random Forest model to predict these prices. The performance of the Random Forest model is evaluated using Root Mean Square Errors (RMSE) and R-Square as test statistics. We also compare the observed prices with the predicted prices using an out of sample dataset.

Load Libraries

library(geodata)
## Warning: package 'geodata' was built under R version 4.3.3
library(lubridate)
library(terra)
library(data.table)
library(randomForest)
library(httr)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
library(pdp)
## Warning: package 'pdp' was built under R version 4.3.3
library(gridExtra)
library(stats)
library(dplyr)
library(stringr)
library(fields)
## Warning: package 'fields' was built under R version 4.3.3
## Warning: package 'spam' was built under R version 4.3.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
library(gdistance)
## Warning: package 'gdistance' was built under R version 4.3.3
## Warning: package 'igraph' was built under R version 4.3.3
library(sp)
library(raster)
library(foreign)
library(gdistance)

Data

This dataset contains price information for various crops across different regions and markets in Tanzania from 2021 to 2024. The data was acquired from Tanzania’s Ministry of Industry and Trade.

setwd("H:/Tanzania Price data/Datasets")

prices <- fread("Tanzania_Price_Data_AllCrops_with_Coordinates4.csv")
dim(prices)
## [1] 8961   21
head(prices)
##           Region   Market Maize..min.price. Maize..max.price. Rice..min.price.
## 1:        Arusha   Arusha             43000             45000           140000
## 2: Dar es Salaam   Temeke             46000             47000           120000
## 3:        Dodoma  Majengo             45000             50000           132000
## 4:        Dodoma Kibaigwa             30000             42000               NA
## 5:        Kagera   Bukoba             33000             35000           110000
## 6:       Manyara   Babati             30000             45000           120000
##    Rice..max.price. Sorghum..min.price. Sorghum..max.price.
## 1:           170000               60000               65000
## 2:           210000               80000              100000
## 3:           200000               50000               60000
## 4:               NA               45000               48000
## 5:           140000              140000              150000
## 6:           170000               60000               80000
##    Bulrush.Millet..min.price. Bulrush.Millet..max.price.
## 1:                      70000                      75000
## 2:                      80000                     100000
## 3:                      52000                      64000
## 4:                         NA                         NA
## 5:                     120000                     140000
## 6:                      80000                     100000
##    Finger.Millet..min.price. Finger.Millet..max.price. Wheat..min.price.
## 1:                    120000                    125000             70000
## 2:                    175000                    180000            110000
## 3:                    200000                    252000                NA
## 4:                        NA                        NA                NA
## 5:                    170000                    180000            170000
## 6:                    120000                    130000            100000
##    Wheat..max.price. Beans..min.price. Beans..max.price.
## 1:             78000            130000            165000
## 2:            120000            220000            260000
## 3:                NA            200000            245000
## 4:                NA                NA                NA
## 5:            180000            110000            170000
## 6:            120000            150000            180000
##    Irish.Potatoes..min.price. Irish.Potatoes..max.price.     Date Latitude
## 1:                      70000                      75000 5/5/2021 -3.36968
## 2:                      75000                      75000 5/5/2021 -6.85097
## 3:                      56000                      68000 5/5/2021 -6.17971
## 4:                         NA                         NA 5/5/2021 -6.08110
## 5:                      60000                      75000 5/5/2021 -1.33140
## 6:                      60000                      60000 5/5/2021 -4.21006
##    Longitude
## 1:  36.68808
## 2:  39.25672
## 3:  35.74109
## 4:  36.64645
## 5:  31.81293
## 6:  35.74915
table(prices$Market)
## 
##       Arusha       Babati      Bariadi     Buguruni       Bukoba      Igawilo 
##          374          278          113           10          424           73 
##        Ilala       Iringa     Kibaigwa       Kigoma    Kilombero    Kinondoni 
##          238          427          211          282           73          203 
##        Lindi      majengo      Majengo      Manyara        Mbeya     Mgandini 
##          281           61          447          116          117           73 
##      Mnalani     Morogoro        Moshi       Mpanda      Mpimbwe       Mtwara 
##           24          421          287          195           39          435 
##       Musoma Mwananyamala       Mwanza       Namfua       Njombe    Nyankumbu 
##          322           73          384           73          174          111 
##        Pwani    Shinyanga      Singida       Songea       Songwe   Sumbawanga 
##           89          317           83          396           83          420 
##       Tabora      Tandale      Tandika        Tanga       Temeke       Ubungo 
##          399           90           99          361          204           81
sapply(prices, class)
##                     Region                     Market 
##                "character"                "character" 
##          Maize..min.price.          Maize..max.price. 
##                  "integer"                  "integer" 
##           Rice..min.price.           Rice..max.price. 
##                  "integer"                  "integer" 
##        Sorghum..min.price.        Sorghum..max.price. 
##                  "integer"                  "integer" 
## Bulrush.Millet..min.price. Bulrush.Millet..max.price. 
##                  "integer"                  "integer" 
##  Finger.Millet..min.price.  Finger.Millet..max.price. 
##                  "integer"                  "integer" 
##          Wheat..min.price.          Wheat..max.price. 
##                  "integer"                  "integer" 
##          Beans..min.price.          Beans..max.price. 
##                  "integer"                  "integer" 
## Irish.Potatoes..min.price. Irish.Potatoes..max.price. 
##                  "integer"                  "integer" 
##                       Date                   Latitude 
##                "character"                  "numeric" 
##                  Longitude 
##                  "numeric"
# Convert to date format
prices$Date <- lubridate::mdy(prices$Date)

Basic Data preperation

Check the Region and Market names and coodinates. Make sure the Region and Market names are harmonized and properly geocoded

unique(prices[Region=="Arusha",.(Market, Latitude, Longitude)])
##       Market Latitude Longitude
## 1:    Arusha -3.36968  36.68808
## 2: Kilombero -3.37324  36.67870
unique(prices[Region=="Dar es Salaam",.(Market, Latitude, Longitude)])
##          Market  Latitude Longitude
## 1:       Temeke -6.850970  39.25672
## 2:    Kinondoni -6.784070  39.27007
## 3:        Ilala -6.829410  39.26289
## 4:      Tandika -6.867912  39.25480
## 5:     Buguruni -6.838380  39.24359
## 6:      Tandale -6.795230  39.24085
## 7:       Ubungo -6.793620  39.20966
## 8: Mwananyamala -6.788890  39.25840
unique(prices[Region=="Dodoma",.(Market, Latitude, Longitude)])
##      Market Latitude Longitude
## 1:  Majengo -6.17971  35.74109
## 2: Kibaigwa -6.08110  36.64645
unique(prices[Region=="Kagera",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Bukoba  -1.3314  31.81293
unique(prices[Region=="Manyara",.(Market, Latitude, Longitude)])
##     Market Latitude Longitude
## 1:  Babati -4.21006  35.74915
## 2: Manyara -4.46011  37.20217
unique(prices[Region=="Rukwa",.(Market, Latitude, Longitude)])
##        Market Latitude Longitude
## 1: Sumbawanga -7.95239  31.62319
unique(prices[Region=="Mpanda",.(Market, Latitude, Longitude)])
## Empty data.table (0 rows and 3 cols): Market,Latitude,Longitude
unique(prices[Region=="Mtwara",.(Market, Latitude, Longitude)])
##    Market  Latitude Longitude
## 1: Mtwara -10.28009  40.18191
unique(prices[Region=="Tabora",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Tabora -5.01972  32.80767
unique(prices[Region=="Tanga",.(Market, Latitude, Longitude)])
##      Market  Latitude Longitude
## 1:    Tanga -5.074260  39.09993
## 2: Mgandini -5.086235  39.09561
unique(prices[Region=="Iringa",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Iringa -7.78001  35.69671
unique(prices[Region=="Kigoma",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Kigoma -4.89697  29.65987
unique(prices[Region=="Morogoro",.(Market, Latitude, Longitude)])
##      Market Latitude Longitude
## 1: Morogoro -6.82771  37.66542
unique(prices[Region=="Mwanza",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Mwanza -2.51969  32.90144
unique(prices[Region=="Mara",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Musoma -1.49982   33.8083
unique(prices[Region=="Ruvuma",.(Market, Latitude, Longitude)])
##    Market  Latitude Longitude
## 1: Songea -10.67873  35.64836
unique(prices[Region=="Shinyanga",.(Market, Latitude, Longitude)])
##       Market Latitude Longitude
## 1: Shinyanga -3.67226  33.43069
unique(prices[Region=="Kilimanjaro",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1:  Moshi -3.34865  37.34352
unique(prices[Region=="Mbeya",.(Market, Latitude, Longitude)])
##     Market  Latitude Longitude
## 1:   Mbeya -8.909940  33.45517
## 2: Igawilo -8.924246  33.56788
unique(prices[Region=="Katavi",.(Market, Latitude, Longitude)])
##     Market Latitude Longitude
## 1:  Mpanda -6.34295  31.07299
## 2: Mpimbwe -7.24425  31.81782
## 3: majengo -6.34809  31.07055
## 4: Majengo -6.34809  31.07055
unique(prices[Region=="Njombe",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Njombe -9.33805  34.76949
unique(prices[Region=="Lindi",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1:  Lindi   -9.995    39.708
unique(prices[Region=="Singida",.(Market, Latitude, Longitude)])
##     Market  Latitude Longitude
## 1: Singida -4.812610   34.7428
## 2:  Namfua -4.821121   34.7470
unique(prices[Region=="Pwani",.(Market, Latitude, Longitude)])
##     Market Latitude Longitude
## 1:   Pwani -6.44221  38.90622
## 2: Mnalani -6.44221  38.90622
unique(prices[Region=="Simiyu",.(Market, Latitude, Longitude)])
##     Market Latitude Longitude
## 1: Bariadi -2.80355  33.98699
unique(prices[Region=="Geita",.(Market, Latitude, Longitude)])
##       Market  Latitude Longitude
## 1: Nyankumbu -2.870760  32.23408
## 2: Nyankumbu -2.895955  32.22871
unique(prices[Region=="Songwe",.(Market, Latitude, Longitude)])
##    Market Latitude Longitude
## 1: Songwe -8.95179  33.24377
setnames(prices, old = "Maize..min.price.", new = "mai.price.min")
setnames(prices, old = "Rice..min.price.", new = "ric.price.min")
setnames(prices, old = "Sorghum..min.price.", new = "sor.price.min")
setnames(prices, old = "Bulrush.Millet..min.price.", new = "bul.price.min")
setnames(prices, old = "Finger.Millet..min.price.", new = "fin.price.min")
setnames(prices, old = "Wheat..min.price.", new = "whe.price.min")
setnames(prices, old = "Beans..min.price.", new = "bea.price.min")
setnames(prices, old = "Irish.Potatoes..min.price.", new = "pot.price.min")
setnames(prices, old = "Maize..max.price.", new = "mai.price.max")
setnames(prices, old = "Rice..max.price.", new = "ric.price.max")
setnames(prices, old = "Sorghum..max.price.", new = "sor.price.max")
setnames(prices, old = "Bulrush.Millet..max.price.", new = "bul.price.max")
setnames(prices, old = "Finger.Millet..max.price.", new = "fin.price.max")
setnames(prices, old = "Wheat..max.price.", new = "whe.price.max")
setnames(prices, old = "Beans..max.price.", new = "bea.price.max")
setnames(prices, old = "Irish.Potatoes..max.price.", new = "pot.price.max")
sapply(prices, class)
##        Region        Market mai.price.min mai.price.max ric.price.min 
##   "character"   "character"     "integer"     "integer"     "integer" 
## ric.price.max sor.price.min sor.price.max bul.price.min bul.price.max 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
## fin.price.min fin.price.max whe.price.min whe.price.max bea.price.min 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
## bea.price.max pot.price.min pot.price.max          Date      Latitude 
##     "integer"     "integer"     "integer"        "Date"     "numeric" 
##     Longitude 
##     "numeric"
#convert prices to numeric 
prices$mai.price.min <- as.numeric(prices$mai.price.min)
prices$ric.price.min <- as.numeric(prices$ric.price.min)
prices$sor.price.min <- as.numeric(prices$sor.price.min)
prices$bul.price.min <- as.numeric(prices$bul.price.min)
prices$fin.price.min <- as.numeric(prices$fin.price.min)
prices$whe.price.min <- as.numeric(prices$whe.price.min)
prices$bea.price.min <- as.numeric(prices$bea.price.min)
prices$pot.price.min <- as.numeric(prices$pot.price.min)

prices$mai.price.max <- as.numeric(prices$mai.price.max)
prices$ric.price.max <- as.numeric(prices$ric.price.max)
prices$sor.price.max <- as.numeric(prices$sor.price.max)
prices$bul.price.max <- as.numeric(prices$bul.price.max)
prices$fin.price.max <- as.numeric(prices$fin.price.max)
prices$whe.price.max <- as.numeric(prices$whe.price.max)
prices$bea.price.max <- as.numeric(prices$bea.price.max)
prices$pot.price.max <- as.numeric(prices$pot.price.max)

sapply(prices, class)
##        Region        Market mai.price.min mai.price.max ric.price.min 
##   "character"   "character"     "numeric"     "numeric"     "numeric" 
## ric.price.max sor.price.min sor.price.max bul.price.min bul.price.max 
##     "numeric"     "numeric"     "numeric"     "numeric"     "numeric" 
## fin.price.min fin.price.max whe.price.min whe.price.max bea.price.min 
##     "numeric"     "numeric"     "numeric"     "numeric"     "numeric" 
## bea.price.max pot.price.min pot.price.max          Date      Latitude 
##     "numeric"     "numeric"     "numeric"        "Date"     "numeric" 
##     Longitude 
##     "numeric"
# convert to price per kg
prices$mai.price.min <- prices$mai.price.min/100
prices$ric.price.min <- prices$ric.price.min/100
prices$sor.price.min <- prices$sor.price.min/100
prices$bul.price.min <- prices$bul.price.min/100
prices$fin.price.min <- prices$fin.price.min/100
prices$whe.price.min <- prices$whe.price.min/100
prices$bea.price.min <- prices$bea.price.min/100
prices$pot.price.min <- prices$pot.price.min/100

prices$mai.price.max <- prices$mai.price.max/100
prices$ric.price.max <- prices$ric.price.max/100
prices$sor.price.max <- prices$sor.price.max/100
prices$bul.price.max <- prices$bul.price.max/100
prices$fin.price.max <- prices$fin.price.max/100
prices$whe.price.max <- prices$whe.price.max/100
prices$bea.price.max <- prices$bea.price.max/100
prices$pot.price.max <- prices$pot.price.max/100
# calculate average of min and max
prices$mai.price <- (prices$mai.price.min + prices$mai.price.max) / 2
prices$ric.price <- (prices$ric.price.min + prices$ric.price.max) / 2
prices$sor.price <- (prices$sor.price.min + prices$sor.price.max) / 2
prices$bul.price <- (prices$bul.price.min + prices$bul.price.max) / 2
prices$fin.price <- (prices$fin.price.min + prices$fin.price.max) / 2
prices$whe.price <- (prices$whe.price.min + prices$whe.price.max) / 2
prices$bea.price <- (prices$bea.price.min + prices$bea.price.max) / 2
prices$pot.price <- (prices$pot.price.min + prices$pot.price.max) / 2
#We can add dates by using the year and the month names
prices$Day   <- day(prices$Date)
prices$Month <- month(prices$Date)
prices$Year  <- year(prices$Date)
# drop unneccessary columns
prices <- prices[,!c("mai.price.min", "mai.price.max",
                     "ric.price.min", "ric.price.max",
                     "sor.price.min", "sor.price.max",
                     "bul.price.min", "bul.price.max",
                     "fin.price.min", "fin.price.max",
                     "whe.price.min", "whe.price.max", 
                     "bea.price.min", "bea.price.max", 
                     "pot.price.min", "pot.price.max")]
# calculate monthly mean prices by market 
prices.monthly <- prices[, .(mai.price = mean(mai.price, na.rm = TRUE), 
           ric.price = mean(ric.price, na.rm = TRUE), 
           sor.price = mean(sor.price, na.rm = TRUE), 
           bul.price = mean(bul.price, na.rm = TRUE),
           fin.price = mean(fin.price, na.rm = TRUE), 
           whe.price = mean(whe.price, na.rm = TRUE), 
           bea.price = mean(bea.price, na.rm = TRUE), 
           pot.price = mean(pot.price, na.rm = TRUE)), 
       by=.(Region, Market, Month, Year, Latitude, Longitude)]
# reshape to long (so that prices for different commodities can be simultaneously estimated)
prices.monthly 
##             Region     Market Month Year Latitude Longitude mai.price ric.price
##   1:        Arusha     Arusha     5 2021 -3.36968  36.68808    469.00  1530.000
##   2: Dar es Salaam     Temeke     5 2021 -6.85097  39.25672    485.00  1650.000
##   3:        Dodoma    Majengo     5 2021 -6.17971  35.74109    501.50  1592.000
##   4:        Dodoma   Kibaigwa     5 2021 -6.08110  36.64645    375.00       NaN
##   5:        Kagera     Bukoba     5 2021 -1.33140  31.81293    426.25  1260.000
##  ---                                                                           
## 950:        Mwanza     Mwanza    10 2024 -2.51969  32.90144    735.00  2016.667
## 951:         Pwani    Mnalani    10 2024 -6.44221  38.90622    925.00  1925.000
## 952:        Simiyu    Bariadi    10 2024 -2.80355  33.98699    667.50  1466.667
## 953:        Kigoma     Kigoma    10 2024 -4.89697  29.65987    667.00  1750.000
## 954:         Rukwa Sumbawanga    10 2024 -7.95239  31.62319    651.25  1916.667
##      sor.price bul.price fin.price whe.price bea.price pot.price
##   1:   695.000   761.000  1333.000     793.0  1545.000  745.0000
##   2:   900.000   900.000  1775.000    1230.0  2420.000  730.0000
##   3:   558.000   581.000  1752.000       NaN  2075.000  618.0000
##   4:   437.500       NaN       NaN       NaN       NaN       NaN
##   5:  1375.000  1337.500  1637.500    1637.5  1300.000  675.0000
##  ---                                                            
## 950:  1416.667  1283.333  1583.333       NaN  2800.000  991.6667
## 951:  1525.000  1800.000  2016.667    2000.0  2920.833 1500.0000
## 952:  1420.833  1825.000  1825.000    2500.0  2845.833 1450.0000
## 953:  1000.000  1500.000  2000.000    1700.0  2025.000  900.0000
## 954:       NaN       NaN   975.000     725.0  2437.500  670.8333
prices.monthly.long <- melt(prices.monthly, id.vars=c('Region', 'Market', 'Month', 'Year', 'Latitude', 'Longitude'),)
# rename columns
setnames(prices.monthly.long, old="variable", new="Crop")
setnames(prices.monthly.long, old="value", new="pkg")
# replace crop names
prices.monthly.long[Crop == "mai.price", Crop := "Maize"]
prices.monthly.long[Crop == "ric.price", Crop := "Rice"]
prices.monthly.long[Crop == "sor.price", Crop := "Sorghum"]
prices.monthly.long[Crop == "bul.price", Crop := "B.Millet"]
prices.monthly.long[Crop == "fin.price", Crop := "F.Millet"]
prices.monthly.long[Crop == "whe.price", Crop := "Wheat"]
prices.monthly.long[Crop == "bea.price", Crop := "Beans"]
prices.monthly.long[Crop == "pot.price", Crop := "Potato"]
# Reset the factor levels to updated levels
prices.monthly.long[, Crop := factor(Crop)]
# Check the unique values again
unique(prices.monthly.long$Crop)
## [1] Maize    Rice     Sorghum  B.Millet F.Millet Wheat    Beans    Potato  
## Levels: Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
# generate dummies to use in place of factors (for later spatial predictions, which are struggling with factors)
prices.monthly.long[, maize   := ifelse(Crop == "Maize",1,0)]
prices.monthly.long[, rice    := ifelse(Crop == "Rice",1,0)]
prices.monthly.long[, sorghum := ifelse(Crop == "Sorghum",1,0)]
prices.monthly.long[, bmillet := ifelse(Crop == "B.Millet",1,0)]
prices.monthly.long[, fmillet := ifelse(Crop == "F.Millet",1,0)]
prices.monthly.long[, wheat   := ifelse(Crop == "Wheat",1,0)]
prices.monthly.long[, beans   := ifelse(Crop == "Beans",1,0)]
prices.monthly.long[, potato  := ifelse(Crop == "Potato",1,0)]
prices.monthly.long[, jan := ifelse(Month == 1  , 1, 0)]
prices.monthly.long[, feb := ifelse(Month == 2  , 1, 0)]
prices.monthly.long[, mar := ifelse(Month == 3  , 1, 0)]
prices.monthly.long[, apr := ifelse(Month == 4  , 1, 0)]
prices.monthly.long[, may := ifelse(Month == 5  , 1, 0)]
prices.monthly.long[, jun := ifelse(Month == 6  , 1, 0)]
prices.monthly.long[, jul := ifelse(Month == 7  , 1, 0)]
prices.monthly.long[, aug := ifelse(Month == 8  , 1, 0)]
prices.monthly.long[, sep := ifelse(Month == 9  , 1, 0)]
prices.monthly.long[, oct := ifelse(Month == 10 , 1, 0)]
prices.monthly.long[, nov := ifelse(Month == 11 , 1, 0)]
prices.monthly.long[, dec := ifelse(Month == 12 , 1, 0)]
# replace NaN with NAs in the price observations
prices.monthly.long[is.nan(pkg), pkg := NA]
# Remove observations with missing observations
prices.monthly.long <- na.omit(prices.monthly.long)
# bring in raster stack as predictors
geodata_path("H:/Tanzania Price data/Datasets/geodata")
list.files("H:/Tanzania Price data/Datasets/geodata", recursive=TRUE)
##  [1] "climate/wc2.1_country/TZA_wc2.1_30s_bio.tif"
##  [2] "travel/travel_time_to_cities_u3.tif"        
##  [3] "travel/travel_time_to_cities_u5.tif"        
##  [4] "TRUE/gadm/gadm41_TZA_0_pk.rds"              
##  [5] "TRUE/gadm/gadm41_TZA_1_pk.rds"              
##  [6] "TRUE/gadm/gadm41_TZA_2_pk.rds"              
##  [7] "TRUE/gadm/gadm41_TZA_3_pk.rds"              
##  [8] "TRUE/spam/spam2017v2r1_ssa_area.zip"        
##  [9] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_A.tif"    
## [10] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_H.tif"    
## [11] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_I.tif"    
## [12] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_L.tif"    
## [13] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_R.tif"    
## [14] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_S.tif"    
## [15] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_A.tif"    
## [16] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_H.tif"    
## [17] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_I.tif"    
## [18] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_L.tif"    
## [19] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_R.tif"    
## [20] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_S.tif"    
## [21] "TRUE/spam/spam2017v2r1_ssa_yield.zip"       
## [22] "TRUE/travel/travel_time_to_cities_u5.tif"   
## [23] "TRUE/travel/travel_time_to_ports_1.tif"     
## [24] "TRUE/wc2.1_country/TZA_wc2.1_30s_bio.tif"
# tza0 <- gadm(country="TZA", level=0)
# tza1 <- gadm(country="TZA", level=1)
# tza2 <- gadm(country="TZA", level=2)
# tza3 <- gadm(country="TZA", level=3)

tza0 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_0_pk.rds")
tza1 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_1_pk.rds")
tza2 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_2_pk.rds")
tza3 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_3_pk.rds")
# convert prices observations to vector for mapping
mypts <- vect(prices.monthly.long, geom=c("Longitude", "Latitude"), crs=crs(tza0), keepgeom=TRUE)

# see if these show up correctly
plot(tza1)
plot(mypts, col="Red", add=TRUE)

# text(mypts, label="Market")
# create reference raster
tza_extent <- ext(tza1) |> floor()
r <- crop(rast(res=1/12), tza_extent)

Interpolate

## Interpolate

#xy <- as.matrix(mypts[,c("Longitude", "Latitude")])
xy <- geom(mypts)[,c("y","x")]
#tps <- Tps(xy, p$spatial)
tps <- Tps(xy, mypts$pkg)
sp <- interpolate(r, tps)
sp <- mask(sp, tza1)
plot(sp)
lines(tza1)

Predict prices with coodinates only

## Predict Maize prices with coodinates only
maize_mypts <- mypts[mypts$Crop == "Maize", ]
rf <- randomForest(pkg ~ Longitude + Latitude , data=maize_mypts)
sp3 <- interpolate(r, rf, xyNames=c("Longitude", "Latitude"))
sp3 <- mask(sp3, tza1)
plot(sp3)
lines(tza1)

Covariates

The covariates used include a mix of crop-specific indicators, temporal variables to capture monthly and yearly effects, geographical coordinates, accessibility measures, climatic conditions, and lagged rainfall to account for delayed effects of weather on crop prices.

ttcity <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/travel/travel_time_to_cities_u5.tif")
ttport <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/travel/travel_time_to_ports_1.tif")
clm    <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/wc2.1_country/TZA_wc2.1_30s_bio.tif")
area   <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/spam/spam2017V2r1_SSA_H_MAIZ_A.tif")
yield  <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_R.tif")
popd  <- rast("gpw_v4_population_density_rev11_2020_10m.tif")
names(ttcity) <- c("ttcity_u5") ## travel time cities of 100k or more
names(ttport) <- c("ttport_1") ## travel time to major ports
names(clm) <- gsub("wc2.1_30s_", "", names(clm))
names(area) <- c("MAI_ARE") # SPAM maize area 2010
names(yield)  <- c("MAI_YLD") # SPAM maize yield 2010
names(popd)  <- c("popdens") # GPW4

comment(ttcity) <- "travel time to cities 100k or more"
comment(ttport) <- "travel time to major ports"

comment(popd) <- "population density 2020 (GPW4 @ 10dm)"

comment(clm)[1] <-"BIO1 = Annual Mean Temperature"
comment(clm)[2] <-"BIO2 = Mean Diurnal Range (Mean of monthly (max temp - min temp))"
comment(clm)[3] <-"BIO3 = Isothermality (BIO2/BIO7) (×100)"
comment(clm)[4] <-"BIO4 = Temperature Seasonality (standard deviation ×100)"
comment(clm)[5] <-"BIO5 = Max Temperature of Warmest Month"
comment(clm)[6] <-"BIO6 = Min Temperature of Coldest Month"
comment(clm)[7] <-"BIO7 = Temperature Annual Range (BIO5-BIO6)"
comment(clm)[8] <-"BIO8 = Mean Temperature of Wettest Quarter"
comment(clm)[9] <-"BIO9 = Mean Temperature of Driest Quarter"
comment(clm)[10] <-"BIO10 = Mean Temperature of Warmest Quarter"
comment(clm)[11] <-"BIO11 = Mean Temperature of Coldest Quarter"
comment(clm)[12] <-"BIO12 = Annual Precipitation"
comment(clm)[13] <-"BIO13 = Precipitation of Wettest Month"
comment(clm)[14] <-"BIO14 = Precipitation of Driest Month"
comment(clm)[15] <-"BIO15 = Precipitation Seasonality (Coefficient of Variation)"
comment(clm)[16] <-"BIO16 = Precipitation of Wettest Quarter"
comment(clm)[17] <-"BIO17 = Precipitation of Driest Quarter"
comment(clm)[18] <-"BIO18 = Precipitation of Warmest Quarter"
comment(clm)[19] <-"BIO19 = Precipitation of Coldest Quarter"

Harmonize rasters to national boundaries and common resolution

ttcity <- resample(ttcity, r)
ttport <- resample(ttport, r)
clm    <- resample(clm, r)
area   <- resample(area, r)
popd   <- resample(popd, r)
freq(is.na(area))
##   layer value count
## 1     1     0 14393
## 2     1     1  6343
area <- classify(area, cbind(NA,0)) 
yield  <- resample(yield, r)
freq(is.na(yield))
##   layer value count
## 1     1     0 14393
## 2     1     1  6343
yield <- classify(yield, cbind(NA,0)) 
# check again 
compareGeom(ttcity, ttport, clm, area, yield, popd)
## [1] TRUE

Generate Latitude and Longitude grid

latgrd <- longrd <- r
latgrd[] <- yFromCell(latgrd, 1:ncell(latgrd))
longrd[] <- xFromCell(longrd, 1:ncell(longrd))
names(latgrd) <- c("latitude")
names(longrd) <- c("longitude")

Prepare Predictor Stack

rstack <- c(ttcity, ttport, clm, area, yield, popd, latgrd, longrd)
names(rstack)
##  [1] "ttcity_u5" "ttport_1"  "bio_1"     "bio_2"     "bio_3"     "bio_4"    
##  [7] "bio_5"     "bio_6"     "bio_7"     "bio_8"     "bio_9"     "bio_10"   
## [13] "bio_11"    "bio_12"    "bio_13"    "bio_14"    "bio_15"    "bio_16"   
## [19] "bio_17"    "bio_18"    "bio_19"    "MAI_ARE"   "MAI_YLD"   "popdens"  
## [25] "latitude"  "longitude"
# create focal mean to extract from (as alternative to using buffers for extraction, which are not supported in terra)
fm <- focalMat(r, d=0.18, type='circle', fillNA=FALSE)
rstack2 <- focal(rstack, w=fm, fun="mean", na.policy="all", fillvalue=NA, # na.rm=TRUE,
                 expand=TRUE, silent=FALSE) #, filename="", overwrite=FALSE) 
# extract values to dataset -- use a 20km buffer
# do a focal sum of 20km radius  - this is about 0.18 of a decimal degree... 0.18*112=20.16
fm <- focalMat(r, d=0.18, type='circle', fillNA=FALSE)
rstack2 <- focal(rstack2, w=fm, fun="sum", na.policy="all", fillvalue=NA, na.rm=TRUE,
                 expand=TRUE, silent=FALSE) #, filename="", overwrite=FALSE) 

Lag Rainfall

Bring in Rainfall Data from Chirps

chirps_path <- "H:/Tanzania Price data/chirps_data"

chirps_files <- list.files(chirps_path, pattern = ".tif$", full.names = TRUE)

# Read all CHIRPS data files into a SpatRaster collection
chirps_rasters <- rast(chirps_files)

#crop to Tanzania boundary
Chirps_Tz <- crop(chirps_rasters, tza1)

writeRaster(Chirps_Tz, "Tz_chirps_monthly_croped.tif", overwrite=TRUE)

Tz_chirps_monthly <- terra::rast("Tz_chirps_monthly_croped.tif")
Tz_chirps_monthly
## class       : SpatRaster 
## dimensions  : 215, 222, 48  (nrow, ncol, nlyr)
## resolution  : 0.05, 0.05  (x, y)
## extent      : 29.35, 40.45, -11.75, -1.000001  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source      : Tz_chirps_monthly_croped.tif 
## names       : chirp~20.11, chirp~20.12, chirp~21.01, chirp~21.02, chirp~21.03, chirp~21.04, ... 
## min values  :  -9999.0000,  -9999.0000,   -9999.000,  -9999.0000,  -9999.0000,  -9999.0000, ... 
## max values  :    459.7112,    504.3329,     508.448,    585.3241,    750.1949,    960.3376, ...
#Replace -9999 with NA
Tz_chirps_monthly <- classify(Tz_chirps_monthly, cbind(-9999,NA))

#extract layer names
layer_names <- names(Tz_chirps_monthly)
layer_names
##  [1] "chirps-v2.0.2020.11" "chirps-v2.0.2020.12" "chirps-v2.0.2021.01"
##  [4] "chirps-v2.0.2021.02" "chirps-v2.0.2021.03" "chirps-v2.0.2021.04"
##  [7] "chirps-v2.0.2021.05" "chirps-v2.0.2021.06" "chirps-v2.0.2021.07"
## [10] "chirps-v2.0.2021.08" "chirps-v2.0.2021.09" "chirps-v2.0.2021.10"
## [13] "chirps-v2.0.2021.11" "chirps-v2.0.2021.12" "chirps-v2.0.2022.01"
## [16] "chirps-v2.0.2022.02" "chirps-v2.0.2022.03" "chirps-v2.0.2022.04"
## [19] "chirps-v2.0.2022.05" "chirps-v2.0.2022.06" "chirps-v2.0.2022.07"
## [22] "chirps-v2.0.2022.08" "chirps-v2.0.2022.09" "chirps-v2.0.2022.10"
## [25] "chirps-v2.0.2022.11" "chirps-v2.0.2022.12" "chirps-v2.0.2023.01"
## [28] "chirps-v2.0.2023.02" "chirps-v2.0.2023.03" "chirps-v2.0.2023.04"
## [31] "chirps-v2.0.2023.05" "chirps-v2.0.2023.06" "chirps-v2.0.2023.07"
## [34] "chirps-v2.0.2023.08" "chirps-v2.0.2023.09" "chirps-v2.0.2023.10"
## [37] "chirps-v2.0.2023.11" "chirps-v2.0.2023.12" "chirps-v2.0.2024.01"
## [40] "chirps-v2.0.2024.02" "chirps-v2.0.2024.03" "chirps-v2.0.2024.04"
## [43] "chirps-v2.0.2024.05" "chirps-v2.0.2024.06" "chirps-v2.0.2024.07"
## [46] "chirps-v2.0.2024.08" "chirps-v2.0.2024.09" "chirps-v2.0.2024.10"
# We need to create a sequence of dates from the layer names
# Extract year and month from layer names and convert to Date
dates <- as.Date(paste0(sub("chirps-v2.0\\.", "", layer_names), "-01"), format = "%Y.%m-%d")
dates
##  [1] "2020-11-01" "2020-12-01" "2021-01-01" "2021-02-01" "2021-03-01"
##  [6] "2021-04-01" "2021-05-01" "2021-06-01" "2021-07-01" "2021-08-01"
## [11] "2021-09-01" "2021-10-01" "2021-11-01" "2021-12-01" "2022-01-01"
## [16] "2022-02-01" "2022-03-01" "2022-04-01" "2022-05-01" "2022-06-01"
## [21] "2022-07-01" "2022-08-01" "2022-09-01" "2022-10-01" "2022-11-01"
## [26] "2022-12-01" "2023-01-01" "2023-02-01" "2023-03-01" "2023-04-01"
## [31] "2023-05-01" "2023-06-01" "2023-07-01" "2023-08-01" "2023-09-01"
## [36] "2023-10-01" "2023-11-01" "2023-12-01" "2024-01-01" "2024-02-01"
## [41] "2024-03-01" "2024-04-01" "2024-05-01" "2024-06-01" "2024-07-01"
## [46] "2024-08-01" "2024-09-01" "2024-10-01"
# Assign these dates to the SpatRaster object
time(Tz_chirps_monthly) <- dates

#rename the layers to the formatted dates
names(Tz_chirps_monthly) <- dates

# Check the SpatRaster object
print(Tz_chirps_monthly)
## class       : SpatRaster 
## dimensions  : 215, 222, 48  (nrow, ncol, nlyr)
## resolution  : 0.05, 0.05  (x, y)
## extent      : 29.35, 40.45, -11.75, -1.000001  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## varname     : Tz_chirps_monthly_croped 
## names       : 2020-11-01, 2020-12-01,  2021-01-01,  2021-02-01,  2021-03-01, 2021-04-01, ... 
## min values  :   5.859746,   1.903993,   0.8161677,   0.3187093,   0.9539621,    1.17571, ... 
## max values  : 459.711212, 504.332947, 508.4479980, 585.3240967, 750.1949463,  960.33765, ... 
## time (days) : 2020-11-01 to 2024-10-01
# do a focal mean of 100km radius - this is about 0.9 of a decimal degree... 0.9009*112=100.9008
# Calculate the focal mean for each layer (month)
fm_r <- focalMat(Tz_chirps_monthly, d=0.9, type='circle', fillNA=FALSE)
Rainfall_focal_sum_100km <- focal(Tz_chirps_monthly, w=fm_r, fun="mean", na.policy="all", fillvalue=NA, na.rm=TRUE,
                                  expand=TRUE, silent=FALSE)
# Check the result
Rainfall_focal_sum_100km
## class       : SpatRaster 
## dimensions  : 215, 222, 48  (nrow, ncol, nlyr)
## resolution  : 0.05, 0.05  (x, y)
## extent      : 29.35, 40.45, -11.75, -1.000001  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## varname     : Tz_chirps_monthly_croped 
## names       : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ... 
## min values  :   20.79649,   27.27837,   8.092316,   4.489223,   11.30184,   15.74836, ... 
## max values  :  313.14248,  326.88227, 416.590468, 395.422070,  464.67840,  569.81990, ... 
## time (days) : 2020-11-01 to 2024-10-01
Rainfall <- Rainfall_focal_sum_100km
#Resample 
Rainfall_res <- resample(Rainfall, r)
Rainfall_res
## class       : SpatRaster 
## dimensions  : 144, 144, 48  (nrow, ncol, nlyr)
## resolution  : 0.08333333, 0.08333333  (x, y)
## extent      : 29, 41, -12, 0  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## names       : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ... 
## min values  :   20.89972,    32.9472,   8.116516,   4.527868,   11.83634,   15.90436, ... 
## max values  :  310.41034,   326.4339, 416.389160, 395.367279,  462.19266,  550.26270, ... 
## time (days) : 2020-11-01 to 2024-10-01

Define a function to calculate the 6-month lagged sum of rainfall values.

Here we define function that calculates 6 month lag sum of rainfall for each month in our data. The output is raster datsets.

calculate_lagged_sum <- function(raster_stack, num_months = 6) {
  # Get the time vector from the raster stack
  time_vector <- time(raster_stack)
  
  # Initialize list to store lagged sum rasters
  lagged_sum_rasters <- vector("list", length(time_vector))
  
  # Loop through each layer in the raster stack
  for (i in seq_along(time_vector)) {
    if (i > num_months) {  # We need at least 'num_months' previous layers to calculate the lagged sum
      # Determine the start and end dates for the lag period
      end_date <- time_vector[i] # Date of the current layer being processed
      start_date <- end_date %m-% months(num_months) #The date num_months before the end_date
      
      # Select the layers that fall within the lag period
      lag_period_layers <- raster_stack[[which(time_vector > start_date & time_vector <= end_date)]]
      
      # Calculate the sum of the selected layers
      if (nlyr(lag_period_layers) == num_months) {
        lagged_sum_rasters[[i]] <- sum(lag_period_layers, na.rm = TRUE)
      } else {
        lagged_sum_rasters[[i]] <- rast(nrow = nrow(raster_stack), ncol = ncol(raster_stack), 
                                        crs = crs(raster_stack), ext = ext(raster_stack), 
                                        vals = NA)  # Use an empty raster with NA values
      }
    } else {
      lagged_sum_rasters[[i]] <- rast(nrow = nrow(raster_stack), ncol = ncol(raster_stack), 
                                      crs = crs(raster_stack), ext = ext(raster_stack), 
                                      vals = NA)  # Use an empty raster with NA values
    }
  }
  
  # Combine the lagged sum rasters into a single raster stack, excluding empty rasters
  lagged_sum_stack <- rast(lagged_sum_rasters)
  
  # Set the names for the layers in the lagged sum stack
  names(lagged_sum_stack) <- names(raster_stack)[!is.na(lagged_sum_rasters)]
  
  return(lagged_sum_stack)
}

# Calculate the 6-month lagged sum for each period in the raster stack
lagged_rainfall_sum <- calculate_lagged_sum(Rainfall_res, num_months = 6)

# Remove the first 6 layers from the raster stack since they are empty
lagged_rainfall_sum_filtered <- lagged_rainfall_sum[[7:nlyr(lagged_rainfall_sum)]]
# check the result
print(lagged_rainfall_sum_filtered)
## class       : SpatRaster 
## dimensions  : 144, 144, 42  (nrow, ncol, nlyr)
## resolution  : 0.08333333, 0.08333333  (x, y)
## extent      : 29, 41, -12, 0  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## names       : 2021-05-01, 2021-06-01, 2021-07-01, 2021-08-01, 2021-09-01, 2021-10-01, ... 
## min values  :   172.7856,   105.5447,   105.4121,   105.1864,   26.20571,   7.070483, ... 
## max values  :  1512.4689,  1355.9669,  1052.9220,  1051.9214, 1038.13874, 643.716021, ...
names(lagged_rainfall_sum_filtered) <- paste0(names(lagged_rainfall_sum_filtered), "_rain.sum.lag")

plot(lagged_rainfall_sum_filtered)

## We'll have to include lagged_rainfall_sum_filtered in the predictor stack.
rstack
## class       : SpatRaster 
## dimensions  : 144, 144, 26  (nrow, ncol, nlyr)
## resolution  : 0.08333333, 0.08333333  (x, y)
## extent      : 29, 41, -12, 0  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source(s)   : memory
## names       :    ttcity_u5,  ttport_1,     bio_1,     bio_2,    bio_3,     bio_4, ... 
## min values  : 3.350081e-02,  997.9427,  4.333545,  6.546645, 53.85881,  19.75255, ... 
## max values  : 1.166560e+03, 3043.3459, 28.472235, 15.285786, 93.14099, 266.33594, ...
#names(rstack)
rstack3 <- c(rstack, lagged_rainfall_sum_filtered)
names(rstack3)
##  [1] "ttcity_u5"               "ttport_1"               
##  [3] "bio_1"                   "bio_2"                  
##  [5] "bio_3"                   "bio_4"                  
##  [7] "bio_5"                   "bio_6"                  
##  [9] "bio_7"                   "bio_8"                  
## [11] "bio_9"                   "bio_10"                 
## [13] "bio_11"                  "bio_12"                 
## [15] "bio_13"                  "bio_14"                 
## [17] "bio_15"                  "bio_16"                 
## [19] "bio_17"                  "bio_18"                 
## [21] "bio_19"                  "MAI_ARE"                
## [23] "MAI_YLD"                 "popdens"                
## [25] "latitude"                "longitude"              
## [27] "2021-05-01_rain.sum.lag" "2021-06-01_rain.sum.lag"
## [29] "2021-07-01_rain.sum.lag" "2021-08-01_rain.sum.lag"
## [31] "2021-09-01_rain.sum.lag" "2021-10-01_rain.sum.lag"
## [33] "2021-11-01_rain.sum.lag" "2021-12-01_rain.sum.lag"
## [35] "2022-01-01_rain.sum.lag" "2022-02-01_rain.sum.lag"
## [37] "2022-03-01_rain.sum.lag" "2022-04-01_rain.sum.lag"
## [39] "2022-05-01_rain.sum.lag" "2022-06-01_rain.sum.lag"
## [41] "2022-07-01_rain.sum.lag" "2022-08-01_rain.sum.lag"
## [43] "2022-09-01_rain.sum.lag" "2022-10-01_rain.sum.lag"
## [45] "2022-11-01_rain.sum.lag" "2022-12-01_rain.sum.lag"
## [47] "2023-01-01_rain.sum.lag" "2023-02-01_rain.sum.lag"
## [49] "2023-03-01_rain.sum.lag" "2023-04-01_rain.sum.lag"
## [51] "2023-05-01_rain.sum.lag" "2023-06-01_rain.sum.lag"
## [53] "2023-07-01_rain.sum.lag" "2023-08-01_rain.sum.lag"
## [55] "2023-09-01_rain.sum.lag" "2023-10-01_rain.sum.lag"
## [57] "2023-11-01_rain.sum.lag" "2023-12-01_rain.sum.lag"
## [59] "2024-01-01_rain.sum.lag" "2024-02-01_rain.sum.lag"
## [61] "2024-03-01_rain.sum.lag" "2024-04-01_rain.sum.lag"
## [63] "2024-05-01_rain.sum.lag" "2024-06-01_rain.sum.lag"
## [65] "2024-07-01_rain.sum.lag" "2024-08-01_rain.sum.lag"
## [67] "2024-09-01_rain.sum.lag" "2024-10-01_rain.sum.lag"
#Extract to the point dataset
extr1 <- terra::extract(rstack3, mypts, method = "bilinear")
mypts <- cbind(mypts, extr1)
# Remove the ID column from the dataset
mypts <- mypts[, !names(mypts) %in% "ID"]

Here we extract sum of lag rainfall for each row under the column rain.sum.lag

mypts_df <- as.data.frame(mypts)

# Define the function to obtain sum of lag rainfall from corresponding rasters to mypts under rain.sum.lag column (for each row)
# Each extraction has to match the month and year
get_rain_sum_row <- function(current_date, mypts_row) {
  # Extract the rainfall value for the current date
  rain_sum <- mypts_row[[paste0(current_date, "_rain.sum.lag")]]
  return(rain_sum)
}

# Loop through each row and obtain the rainfall sum for each month and year
for (i in 1:nrow(mypts_df)) {
  # Extract relevant data for the current row
  month <- mypts_df$Month[i]
  year <- mypts_df$Year[i]
  current_date <- paste0(year, "-", sprintf("%02d", month), "-01")  # Format date correctly
  # Pass necessary data to the function
  rain_sum <- get_rain_sum_row(current_date, mypts_df[i, ])
  # Update the rain.sum.lag column
  mypts_df$rain.sum.lag[i] <- rain_sum
}

# Update the SpatVector with the new rain.avg column
mypts$rain.sum.lag <- mypts_df$rain.sum.lag
# I'll drop the dates with rain.sum.lag from mypts, seems redundant
column_indices <- grep("^202[0-4]-", names(mypts))
mypts <- mypts[, -column_indices]
names(mypts)
##  [1] "Region"       "Market"       "Month"        "Year"         "Latitude"    
##  [6] "Longitude"    "Crop"         "pkg"          "maize"        "rice"        
## [11] "sorghum"      "bmillet"      "fmillet"      "wheat"        "beans"       
## [16] "potato"       "jan"          "feb"          "mar"          "apr"         
## [21] "may"          "jun"          "jul"          "aug"          "sep"         
## [26] "oct"          "nov"          "dec"          "ttcity_u5"    "ttport_1"    
## [31] "bio_1"        "bio_2"        "bio_3"        "bio_4"        "bio_5"       
## [36] "bio_6"        "bio_7"        "bio_8"        "bio_9"        "bio_10"      
## [41] "bio_11"       "bio_12"       "bio_13"       "bio_14"       "bio_15"      
## [46] "bio_16"       "bio_17"       "bio_18"       "bio_19"       "MAI_ARE"     
## [51] "MAI_YLD"      "popdens"      "latitude"     "longitude"    "rain.sum.lag"
#define Month as a factor
#mypts$Month <- as.factor(mypts$Month)
#levels(mypts$Month)

#We'll define month as an interger instead.
# Check to make sure Month is interger
sapply(mypts, class)
##       Region       Market        Month         Year     Latitude    Longitude 
##  "character"  "character"    "integer"    "integer"    "numeric"    "numeric" 
##         Crop          pkg        maize         rice      sorghum      bmillet 
##     "factor"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##      fmillet        wheat        beans       potato          jan          feb 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##          mar          apr          may          jun          jul          aug 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##          sep          oct          nov          dec    ttcity_u5     ttport_1 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##        bio_1        bio_2        bio_3        bio_4        bio_5        bio_6 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##        bio_7        bio_8        bio_9       bio_10       bio_11       bio_12 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##       bio_13       bio_14       bio_15       bio_16       bio_17       bio_18 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##       bio_19      MAI_ARE      MAI_YLD      popdens     latitude    longitude 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
## rain.sum.lag 
##    "numeric"
# drop levels that don't exist in Crop field
mypts$Crop <- mypts$Crop[,drop=TRUE]
levels(mypts$Crop)
## [1] "Maize"    "Rice"     "Sorghum"  "B.Millet" "F.Millet" "Wheat"    "Beans"   
## [8] "Potato"

Linear model price Prediction

Coefficient estimates from the linear model provide a detailed insight into the relationship between each predictor and the response variable.

# Fit the linear model
lm_model <- lm(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
                 Month +
                 Year + 
                 ttcity_u5 + ttport_1 + 
                 bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + 
                 bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13
               + bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 +
                 MAI_ARE + MAI_YLD + 
                 Longitude + Latitude + 
                 rain.sum.lag,
               data = mypts)

summary(lm_model)
## 
## Call:
## lm(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet + 
##     wheat + beans + potato + Month + Year + ttcity_u5 + ttport_1 + 
##     bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + bio_7 + bio_8 + 
##     bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 + 
##     bio_16 + bio_17 + bio_18 + bio_19 + MAI_ARE + MAI_YLD + Longitude + 
##     Latitude + rain.sum.lag, data = mypts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1371.18  -256.31   -15.15   239.45  2295.70 
## 
## Coefficients: (2 not defined because of singularities)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -3.352e+05  1.045e+04 -32.070  < 2e-16 ***
## maize        -9.316e+01  1.803e+01  -5.166 2.46e-07 ***
## rice          1.344e+03  1.802e+01  74.603  < 2e-16 ***
## sorghum       3.805e+02  1.901e+01  20.015  < 2e-16 ***
## bmillet       4.743e+02  2.028e+01  23.388  < 2e-16 ***
## fmillet       7.742e+02  1.874e+01  41.313  < 2e-16 ***
## wheat         9.343e+02  2.051e+01  45.554  < 2e-16 ***
## beans         1.529e+03  1.804e+01  84.753  < 2e-16 ***
## potato               NA         NA      NA       NA    
## Month        -6.529e-01  1.897e+00  -0.344 0.730686    
## Year          1.586e+02  4.894e+00  32.417  < 2e-16 ***
## ttcity_u5     1.832e+00  1.565e-01  11.704  < 2e-16 ***
## ttport_1     -1.798e+00  1.700e-01 -10.575  < 2e-16 ***
## bio_1        -2.311e+03  1.879e+02 -12.298  < 2e-16 ***
## bio_2        -2.291e+03  2.166e+02 -10.580  < 2e-16 ***
## bio_3         2.032e+02  1.813e+01  11.213  < 2e-16 ***
## bio_4         1.540e+01  3.835e+00   4.015 6.01e-05 ***
## bio_5         1.719e+03  1.682e+02  10.216  < 2e-16 ***
## bio_6        -1.637e+03  1.751e+02  -9.351  < 2e-16 ***
## bio_7                NA         NA      NA       NA    
## bio_8         8.409e+02  9.086e+01   9.255  < 2e-16 ***
## bio_9        -7.335e+01  7.384e+01  -0.993 0.320559    
## bio_10       -8.061e+02  1.974e+02  -4.084 4.48e-05 ***
## bio_11        2.300e+03  2.842e+02   8.094 6.83e-16 ***
## bio_12        1.090e+00  2.647e-01   4.116 3.90e-05 ***
## bio_13        7.968e-01  8.857e-01   0.900 0.368304    
## bio_14        4.350e+01  1.147e+01   3.792 0.000151 ***
## bio_15        1.844e+01  2.021e+00   9.128  < 2e-16 ***
## bio_16       -1.522e+00  6.773e-01  -2.247 0.024658 *  
## bio_17       -1.242e+01  3.270e+00  -3.799 0.000147 ***
## bio_18        4.681e-02  2.382e-01   0.197 0.844221    
## bio_19        2.942e+00  2.570e+00   1.145 0.252402    
## MAI_ARE       1.628e-02  2.852e-02   0.571 0.568054    
## MAI_YLD       1.095e-01  1.527e-02   7.173 8.12e-13 ***
## Longitude     5.937e+01  3.237e+01   1.834 0.066682 .  
## Latitude     -2.043e+01  3.163e+01  -0.646 0.518352    
## rain.sum.lag -1.318e-01  1.865e-02  -7.065 1.77e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 389.5 on 6561 degrees of freedom
## Multiple R-squared:  0.7194, Adjusted R-squared:  0.718 
## F-statistic: 494.8 on 34 and 6561 DF,  p-value: < 2.2e-16

RandomForest and TPS

Random Forest price prediction

First check if there are any predictors with NA values

for(column in seq_along(mypts)){
  if(any(is.na(mypts[column]))){
    print(paste0("Column: ", colnames(mypts)[column], " has at least one NA value"))
  }
}

#There are no columns with missing values

Split data the to be used for Training and validation

Data from May 2021 - Dec 2023 will be used for model training while more recent data from Jan - June 2024 will be used for Validation.

mypts <- as.data.frame(mypts)

# Filter the data for training (May 2021 - Dec 2023)
training_data <- mypts[mypts$Year %in% c(2021, 2022, 2023), ]
# Check training data
#head(training_data)
# Filter the data for validation (Jan 2024 - June 2024)
validation_data <- mypts[mypts$Year == 2024, ]
# Check validation data
#head(validation_data)

Random Forest for generating variable of importance

Tune The Forest

The tuneRF function in the randomForest package is used to tune the mtry parameter, which is the number of variables randomly sampled as candidates at each split in the random forest. The function requires a data frame of predictor variables and a response variable.

# Convert training_data data to data frame
mypts_df <- as.data.frame(training_data)

trf <- tuneRF(x=mypts_df[,1:ncol(mypts_df)], # Prediction variables
              y=mypts_df$pkg) # Response variable
## mtry = 18  OOB error = 1407.873 
## Searching left ...
## mtry = 9     OOB error = 7242.192 
## -4.144066 0.05 
## Searching right ...
## mtry = 36    OOB error = 162.1689 
## 0.8848129 0.05 
## mtry = 55    OOB error = 98.42907 
## 0.3930459 0.05

Based on the output from tuneRF, you can observe that the mtry value that gives the lowest Out-of-Bag (OOB) error. To build the first random forest model, we will use this mtry value.

(mintree <- trf[which.min(trf[,2]),1])
## [1] 55

Fit The Random Forest Model (1)

Random Forest for generating variable of importance

Here, the model is fitted using the randomForest function to build a predictive model for food commodity prices. The model takes the response variable, the prediction variables and the optimal number of variables to consider at each split. The goal is to generate Variable importance scores which will help us understand which variables have the most significant impact on the response variable, enabling us to interpret the model and possibly simplify it by focusing on the most important predictors.

# Create the random forest model
rf1 <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
                      Month + 
                      Year +
                      ttcity_u5 + ttport_1 + 
                      bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + 
                      bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 +
                      bio_13 + bio_14 + bio_15 + bio_16 + bio_17 + 
                      bio_18 + bio_19 + MAI_ARE + MAI_YLD + 
                      Longitude + Latitude + 
                      rain.sum.lag,
                    data = training_data,mtry=mintree,
                    importance=TRUE,na.rm=TRUE)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
rf1
## 
## Call:
##  randomForest(formula = pkg ~ maize + rice + sorghum + bmillet +      fmillet + wheat + beans + potato + Month + Year + ttcity_u5 +      ttport_1 + bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 +      bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 +      bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 + MAI_ARE +      MAI_YLD + Longitude + Latitude + rain.sum.lag, data = training_data,      mtry = mintree, importance = TRUE, na.rm = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 36
## 
##           Mean of squared residuals: 24140.83
##                     % Var explained: 95.37
varImpPlot(rf1)

## evaluate
(oob <- sqrt(rf1$mse[which.min(rf1$mse)]))
## [1] 154.9753

This calculates the RMSE of the tree in the Random Forest model that has the lowest OOB error.

importance_metrics <- importance(rf1, type=1)  # %IncMSE
impvar <- rownames(importance_metrics)[order(importance_metrics[, 1], decreasing=TRUE)]
# Get the top 20 variables
top_20_vars <- impvar[1:20]
top_20_vars
##  [1] "Year"         "Month"        "beans"        "rice"         "sorghum"     
##  [6] "Longitude"    "rain.sum.lag" "bmillet"      "wheat"        "fmillet"     
## [11] "bio_12"       "bio_18"       "MAI_YLD"      "bio_15"       "MAI_ARE"     
## [16] "Latitude"     "bio_3"        "ttcity_u5"    "ttport_1"     "bio_4"
node_purity <- importance(rf1, type=2)  # IncNodePurity
# Sort variables by importance (IncNodePurity)
node_purity_sorted <- sort(node_purity[,1], decreasing = TRUE)
# Select the top 20 important variables
top_vars <- names(node_purity_sorted)[1:20]
print(top_vars)
##  [1] "rice"         "beans"        "Year"         "wheat"        "fmillet"     
##  [6] "Month"        "potato"       "bio_12"       "maize"        "Longitude"   
## [11] "rain.sum.lag" "sorghum"      "bmillet"      "bio_3"        "bio_7"       
## [16] "MAI_YLD"      "bio_18"       "ttcity_u5"    "MAI_ARE"      "Latitude"
rf1$importanceSD
##        maize         rice      sorghum      bmillet      fmillet        wheat 
##    1955.5603    2879.1917     452.8803     612.4819    1711.3199    1616.8257 
##        beans       potato        Month         Year    ttcity_u5     ttport_1 
##    3061.0326    1929.0591     258.5696     548.4348     263.7380     254.9017 
##        bio_1        bio_2        bio_3        bio_4        bio_5        bio_6 
##     139.0556     278.9766     593.3162     168.4609     197.7081     307.8925 
##        bio_7        bio_8        bio_9       bio_10       bio_11       bio_12 
##     749.3357     117.0002     272.9865     155.8555     157.6274     821.6118 
##       bio_13       bio_14       bio_15       bio_16       bio_17       bio_18 
##     153.2510     205.3050     266.3121     149.6587     157.4320     251.5292 
##       bio_19      MAI_ARE      MAI_YLD    Longitude     Latitude rain.sum.lag 
##     193.6378     258.2230     417.8781     366.7401     307.8064     191.8319

Fit The Random Forest Model (2)

Estimate more parsimonious specification

In this section, we aim to refine our model by selecting the most important variables. We will review the importance metrics (%IncMSE and IncNodePurity) to identify the variables that contribute the most to the model’s predictive power. Our focus will be on variables with higher importance values to ensure a more efficient and interpretable model.

# Estimate more parsimonious specification
rf <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
                     Month +
                     Year + 
                     ttport_1 +
                     popdens + 
                     bio_3 + bio_6  + bio_9 + bio_12 + 
                     rain.sum.lag, 
                   data=training_data, na.rm=TRUE)

rf
## 
## Call:
##  randomForest(formula = pkg ~ maize + rice + sorghum + bmillet +      fmillet + wheat + beans + potato + Month + Year + ttport_1 +      popdens + bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag,      data = training_data, na.rm = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 30062.28
##                     % Var explained: 94.24
# evaluate
varImpPlot(rf)

(oob <- sqrt(rf$mse[which.min(rf$mse)]))
## [1] 173.3164
partialPlot(rf, as.data.frame(training_data), "rain.sum.lag")

spatial prediction

These are prediction plots for each food commodities (maize, beans, rice, sorghum, bmillet, fmillet, wheat, potato) with their respective month of the year 2023.

year <- 2024

Note: we must set the rain.sum.lag variables for each month we are predicting

Predicted Maize Prices

#Maize
# Create an empty list to store predictions for maize
predict_for_month <- function(month){
  rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")] # Remember to change depending on year
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_maize <- data.frame(
    maize = 1, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_maize, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_maize <- lapply(1:10, predict_for_month)


# Extract pixel values from predictions_maize
maize_values <- unlist(lapply(predictions_maize, values))
# Get min and max values
min_maize <- min(maize_values, na.rm = TRUE)
max_maize <- max(maize_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 100 

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 100

# Create a 3x4 matrix of plots
par(mar = c(0, 0, 0, 0))  # Set margins to 0 for inner plots
for (i in 1:10) {
  plot(predictions_maize[[i]], main = paste("Maize prices", toupper(i), year),
       zlim = c(min_maize, max_maize), col = color_palette, breaks = seq(min_maize, max_maize, by = break_interval), legend = FALSE, axes = FALSE)
  # Add region boundaries
  plot(tza1, add = TRUE, border = "black", lwd = 0.1)
  
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_maize, max_maize), n = 4)

# Reset plot layout for the legend
layout(matrix(1))
par(mar = c(5, 4, 2, 1))

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_maize, max_maize), legend.only = TRUE,
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9,
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
           legend.args = list(text = "Predicted Maize Price (TZS Per Kg)", side = 1, line = 2, cex = 0.9))

Predicted Beans Prices

# Beans
# Function to predict beans for a given month
predict_for_beans <- function(month) {
  rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"

  newstack <- c(rstack, rain_sum_lag)

  const_beans <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 1, potato = 0,
    Month = month,
    Year = year
  )

  pred <- predict(newstack, rf, const = const_beans, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_beans <- lapply(1:10, predict_for_beans)

# Extract pixel values from predictions_beans
bean_values <- unlist(lapply(predictions_beans, values))
min_bean <- min(bean_values, na.rm = TRUE)
max_bean <- max(bean_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 150 

# Loop through each month to plot beans prices
par(mar = c(0, 0, 0, 0))
for (i in 1:10) {
  plot(predictions_beans[[i]], main = paste("Beans prices", toupper(i), year), 
       zlim = c(min_bean, max_bean), col = color_palette, breaks = seq(min_bean, max_bean, by = break_interval), legend = FALSE, axes = FALSE)
  # Add region boundaries
  plot(tza1, add = TRUE, border = "black", lwd = 0.1)
  
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_bean, max_bean), n = 4)

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_bean, max_bean), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Beans Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))

Predicted Rice Prices

# Rice
# Function to predict rice prices for a given month
predict_for_rice <- function(month) {
  rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_rice <- data.frame(
    maize = 0, rice = 1, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_rice, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_rice <- lapply(1:10, predict_for_rice)

# Extract pixel values from predictions_rice
rice_values <- unlist(lapply(predictions_rice, values))
min_rice <- min(rice_values, na.rm = TRUE)
max_rice <- max(rice_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 150 

# Loop through each month to plot rice prices
par(mar = c(0, 0, 0, 0))
for (i in 1:10) {
  plot(predictions_rice[[i]], main = paste("Rice prices", toupper(i), year), 
       zlim = c(min_rice, max_rice), col = color_palette, breaks = seq(min_rice, max_rice, by = break_interval), legend = FALSE, axes = FALSE)
  # Add region boundaries
  plot(tza1, add = TRUE, border = "black", lwd = 0.1)
  
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_rice, max_rice), n = 5)

# Reset plot layout to 1x1 for the legend
layout(matrix(1)) 

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_rice, max_rice), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Rice Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))

Predicted Sorghum Prices

# Function to predict sorghum prices for a given month
predict_for_sorghum <- function(month) {
  rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_sorghum <- data.frame(
    maize = 0, rice = 0, sorghum = 1, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = 2023
  )
  
  pred <- predict(newstack, rf, const = const_sorghum, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_sorghum <- lapply(1:10, predict_for_sorghum)

# Extract pixel values from predictions_sorghum
sorghum_values <- unlist(lapply(predictions_sorghum, values))
min_sorghum <- min(sorghum_values, na.rm = TRUE)
max_sorghum <- max(sorghum_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

break_interval <- 300 

# Loop through each month to plot sorghum prices
par(mar = c(0, 0, 0, 0))
for (i in 1:10) {
  plot(predictions_sorghum[[i]], main = paste("Sorghum prices", toupper(i), year), 
       zlim = c(min_sorghum, max_sorghum), col = color_palette, breaks = seq(min_sorghum, max_sorghum, by = break_interval), legend = FALSE, axes = FALSE)
  # Add region boundaries
  plot(tza1, add = TRUE, border = "black", lwd = 0.1)
  
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_sorghum, max_sorghum), n = 5)

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_sorghum, max_sorghum), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Sorghum Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))

Predicted Bulrush Millet Prices

# bmillet
# Function to predict bmillet prices for a given month
predict_for_bmillet <- function(month) {
  rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_bmillet <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 1, fmillet = 0, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_bmillet, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_bmillet <- lapply(1:10, predict_for_bmillet)


# Extract pixel values from predictions_bmillet
bmillet_values <- unlist(lapply(predictions_bmillet, values))
min_bmillet <- min(bmillet_values, na.rm = TRUE)
max_bmillet <- max(bmillet_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout with an extra row for the legend
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

# Loop through each month to plot bmillet prices
break_interval <- 150 
par(mar = c(0, 0, 0, 0)) 
for (i in 1:10) {
  plot(predictions_bmillet[[i]], main = paste("Bmillet prices", toupper(i), year), 
       zlim = c(min_bmillet, max_bmillet), col = color_palette, 
       breaks = seq(min_bmillet, max_bmillet, by = break_interval), 
       legend = FALSE, axes = FALSE)
  # Add region boundaries
  plot(tza1, add = TRUE, border = "black", lwd = 0.1)
  
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_bmillet, max_bmillet), n = 5) 

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_bmillet, max_bmillet), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Bulrush Millet Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))

Predicted Finger Millet Prices

#fmillet
# Function to predict fmillet prices for a given month
predict_for_fmillet <- function(month) {
  rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_fmillet <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 1, wheat = 0, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_fmillet, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_fmillet <- lapply(1:10, predict_for_fmillet)

# Extract pixel values from predictions_fmillet
fmillet_values <- unlist(lapply(predictions_fmillet, values))
min_fmillet <- min(fmillet_values, na.rm = TRUE)
max_fmillet <- max(fmillet_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout with an extra row for the legend
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

# Loop through each month to plot fmillet prices
break_interval <- 300 
par(mar = c(0, 0, 0, 0))
for (i in 1:10) {
  plot(predictions_fmillet[[i]], main = paste("Fmillet prices", toupper(i), year), 
       zlim = c(min_fmillet, max_fmillet), col = color_palette, 
       breaks = seq(min_fmillet, max_fmillet, by = break_interval), 
       legend = FALSE, axes = FALSE)
  # Add region boundaries
  plot(tza1, add = TRUE, border = "black", lwd = 0.1)
  
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_fmillet, max_fmillet), n = 3)  

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_fmillet, max_fmillet), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Finger Millet Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))

Predicted Wheat Prices

#Wheat
# Function to predict wheat prices for a given month
predict_for_wheat <- function(month) {
  rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_wheat <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 1, beans = 0, potato = 0,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_wheat, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_wheat <- lapply(1:10, predict_for_wheat)
# Extract pixel values from predictions_wheat
wheat_values <- unlist(lapply(predictions_wheat, values))
min_wheat <- min(wheat_values, na.rm = TRUE)
max_wheat <- max(wheat_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Define the break interval for both plot and legend
break_interval <- 100

# Loop through each month to plot wheat prices
par(mar = c(0, 0, 0, 0))
for (i in 1:10) {
  plot(predictions_wheat[[i]], main = paste("Wheat prices", toupper(i), year), 
       zlim = c(min_wheat, max_wheat), col = color_palette, 
       breaks = seq(min_wheat, max_wheat, by = break_interval), 
       legend = FALSE, axes = FALSE)
  # Add region boundaries
  plot(tza1, add = TRUE, border = "black", lwd = 0.1)
  
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_wheat, max_wheat), n = 5)

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  

# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_wheat, max_wheat), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Wheat Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))

Predicted potato prices

#potatoes
# Function to predict potato prices for a given month
predict_for_potato <- function(month) {
  rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
  names(rain_sum_lag) <- "rain.sum.lag"
  
  newstack <- c(rstack, rain_sum_lag)
  
  const_potato <- data.frame(
    maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 1,
    Month = month,
    Year = year
  )
  
  pred <- predict(newstack, rf, const = const_potato, na.rm = TRUE)
  pred <- mask(pred, tza1)
  return(pred)
}

# Create predictions for all months
predictions_potato <- lapply(1:10, predict_for_potato)

# Extract pixel values from predictions_potato
potato_values <- unlist(lapply(predictions_potato, values))
min_potato <- min(potato_values, na.rm = TRUE)
max_potato <- max(potato_values, na.rm = TRUE)

# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))

# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
                          5, 6, 7, 8,
                          9, 10, 11, 12,
                          13, 13, 13, 13), nrow = 4, byrow = TRUE)

# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))

# Loop through each month to plot potato prices
break_interval <- 150
par(mar = c(0, 0, 0, 0))
for (i in 1:10) {
  plot(predictions_potato[[i]], main = paste("Potato prices", toupper(i), year), 
       zlim = c(min_potato, max_potato), col = color_palette, 
       breaks = seq(min_potato, max_potato, by = break_interval), 
       legend = FALSE, axes = FALSE)
  # Add region boundaries
  plot(tza1, add = TRUE, border = "black", lwd = 0.1)
  
  points(training_data, pch = 20, col = "red", cex = 0.5)
}

# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_potato, max_potato), n = 5)  # Adjust n as needed

# Reset plot layout to 1x1 for the legend
layout(matrix(1))  

# Set margins for the legend
par(mar = c(5, 4, 2, 1))  
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_potato, max_potato), legend.only = TRUE, 
           col = color_palette, horizontal = TRUE,
           legend.width = 0.7, legend.shrink = 0.9, 
           axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9), 
           legend.args = list(text = "Predicted Potato Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))

Prediction Evaluation

1. Using 2024 Validation data

pred<-predict(object=rf, newdata=validation_data)
actual<-validation_data$pkg
result<-data.frame(actual=actual, predicted=pred)
mse <- mean((actual - pred)^2, na.rm=TRUE)
paste('Mean Squared Error:', mse)
## [1] "Mean Squared Error: 156604.920494802"
rmse <- sqrt(mse)
paste('Root Mean Squared error: ',mean(sqrt(rf$mse)))
## [1] "Root Mean Squared error:  177.66089985911"
#Save predicted & observed price
write.csv(result, "result.csv")
#reading result.csv file (predicted vs observed)
rslt <- read.csv("result.csv", header=T)
print(names(rslt))
## [1] "X"         "actual"    "predicted"
#RMSE predicting from rf - predicited vs observed 
rf.rmse<-round(sqrt(mean( (rslt$actual-rslt$predicted)^2 , na.rm = TRUE )),2)
print(rf.rmse)
## [1] 395.73
#R-square
rf.r2<-round(summary(lm(actual~predicted, rslt))$r.squared,2)
print(rf.r2)
## [1] 0.75
range(actual)
## [1]  350 4050
range(pred)
## [1]  615.6416 3266.9242
#plotting predicted Vs observed
ggplot(result, aes(x=actual, y=predicted), alpha=0.6) +
  geom_point(colour = "blue", size = 1.4, alpha=0.6) +
  ggtitle('Random Forest "Wholesale Grain Prices in Tanzania"') +
  scale_x_continuous("Observed Price (Tsh) Per Kg",
                     limits = c(0, 5000),
                     breaks = seq(0, 5000, 1000)) +
  scale_y_continuous("Predicted Price (Tsh) Per Kg",
                     limits = c(0, 5000),
                     breaks = seq(0, 5000, 1000)) +
  theme(axis.line = element_line(colour = "black"),
        axis.text.y = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 1),
        axis.text.x = element_text(size = 8)) +
  geom_abline(intercept = 0, slope = 1, linewidth = 0.5) +
  geom_smooth(aes(x = actual, y = predicted), formula = y ~ x, method = "lm", se = FALSE, colour = "red", linetype = 2, size = 0.9) +
  annotate("text", x = 300, y = 4500, label = paste("RMSE:", rf.rmse)) +
  annotate("text", x = 300, y = 4200, label = paste("R^2: ", rf.r2), parse = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

2. Compare the observed Prices (the training data) with the predicted Prices (predicted using the training data) using stats package

library(stats)
mypts_df$pred <- stats::predict(rf)
rsq <- function (obs, pred) cor(obs, pred, use = 'complete.obs') ^ 2
RMSE <- function(obs, pred){sqrt(mean((pred - obs)^2, na.rm = TRUE))}
fr2_rsq <- rsq(mypts_df$pkg, mypts_df$pred) %>% round(digits = 2)
fr2_rmse <- RMSE(mypts_df$pkg, mypts_df$pred) %>% round(digits = 0)
Price_fit_plot <- ggplot(data = mypts_df, aes(x = pkg, y = pred)) +
  geom_point(colour = "blue", size = 1.4 ,alpha=0.6) + 
  ggtitle('Observed vs Predicted "Wholesale Grain Prices in Tanzania"') +
  geom_abline(slope = 1, alpha=0.3) +
  annotate('text', x = 150, y = 4500, label = paste0("R^{2}==", fr2_rsq), parse = TRUE, size=3)  +
  annotate('text', x = 150, y = 4200, label = paste0("RMSE==", fr2_rmse), parse = TRUE, size=3)  +
  labs(x = "Observed Price (Tsh) Per Kg", y = "Predicted Price (Tsh) Per Kg") +
  xlim(0, 5000) + ylim(0, 5000)
Price_fit_plot

Partial dependence plots

Plot Partial dependence of all the variables used except food commodities and months.

library(caret)

var_importance <- varImp(rf)

impvar <- rownames(var_importance)[order(var_importance[, 1], decreasing=TRUE)]
op <- par(mfrow=c(2, 4))
# exclude food commodities and months
predictors_to_plot <- setdiff(impvar, c("maize", "rice", "sorghum", "bmillet", "fmillet", "wheat", "beans", "potato", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"))

for (i in seq_along(predictors_to_plot)) {
  partialPlot(rf, as.data.frame(training_data), predictors_to_plot[i], xlab=predictors_to_plot[i],
              main="Partial Dependence")
}

Descriptive statistics for each crop

# Compute descriptive statistics for each crop
stats <- mypts %>%
  filter(pkg > 0) %>%  
  group_by(Crop) %>%
  summarise(
    Mean = mean(pkg, na.rm = TRUE),
    Median = median(pkg, na.rm = TRUE),
    Minimum = min(pkg, na.rm = TRUE),
    Maximum = max(pkg, na.rm = TRUE),
    Std_Dev = sd(pkg, na.rm = TRUE),
    IQR = IQR(pkg, na.rm = TRUE),
    Observations = n()
  ) %>%
  arrange(Crop)

print(stats)
## # A tibble: 8 × 8
##   Crop      Mean Median Minimum Maximum Std_Dev   IQR Observations
##   <fct>    <dbl>  <dbl>   <dbl>   <dbl>   <dbl> <dbl>        <int>
## 1 Maize     773.   726.    268.   1722.    266.  438.          946
## 2 Rice     2213.  2200     800    3518.    569.  920.          948
## 3 Sorghum  1263.  1250     420    3250     442.  642.          783
## 4 B.Millet 1355.  1250     469.   4000     515.  785.          632
## 5 F.Millet 1642.  1700     700    3000     345.  450           817
## 6 Wheat    1789.  1800     368.   4050     545.  569.          604
## 7 Beans    2397.  2420     985    3850     561.  952.          944
## 8 Potato    871.   850     300    2000     255.  251.          922

correlation Plots

These are correlation plots for pooled sample in two periods: post-harvest (May-Oct) and lean season (Nov-April).

#We'll use mean Monthly data in wide format
prices.monthly
##             Region     Market Month Year Latitude Longitude mai.price ric.price
##   1:        Arusha     Arusha     5 2021 -3.36968  36.68808    469.00  1530.000
##   2: Dar es Salaam     Temeke     5 2021 -6.85097  39.25672    485.00  1650.000
##   3:        Dodoma    Majengo     5 2021 -6.17971  35.74109    501.50  1592.000
##   4:        Dodoma   Kibaigwa     5 2021 -6.08110  36.64645    375.00       NaN
##   5:        Kagera     Bukoba     5 2021 -1.33140  31.81293    426.25  1260.000
##  ---                                                                           
## 950:        Mwanza     Mwanza    10 2024 -2.51969  32.90144    735.00  2016.667
## 951:         Pwani    Mnalani    10 2024 -6.44221  38.90622    925.00  1925.000
## 952:        Simiyu    Bariadi    10 2024 -2.80355  33.98699    667.50  1466.667
## 953:        Kigoma     Kigoma    10 2024 -4.89697  29.65987    667.00  1750.000
## 954:         Rukwa Sumbawanga    10 2024 -7.95239  31.62319    651.25  1916.667
##      sor.price bul.price fin.price whe.price bea.price pot.price
##   1:   695.000   761.000  1333.000     793.0  1545.000  745.0000
##   2:   900.000   900.000  1775.000    1230.0  2420.000  730.0000
##   3:   558.000   581.000  1752.000       NaN  2075.000  618.0000
##   4:   437.500       NaN       NaN       NaN       NaN       NaN
##   5:  1375.000  1337.500  1637.500    1637.5  1300.000  675.0000
##  ---                                                            
## 950:  1416.667  1283.333  1583.333       NaN  2800.000  991.6667
## 951:  1525.000  1800.000  2016.667    2000.0  2920.833 1500.0000
## 952:  1420.833  1825.000  1825.000    2500.0  2845.833 1450.0000
## 953:  1000.000  1500.000  2000.000    1700.0  2025.000  900.0000
## 954:       NaN       NaN   975.000     725.0  2437.500  670.8333
# Create a function to determine the season
get_season <- function(month) {
  if (month %in% 5:10) {
    return("Post-Harvest")
  } else {
    return("Lean Season")
  }
}
# Add a 'Season' column to the data
prices.monthly$Season <- sapply(prices.monthly$Month, get_season)
# Post Harvest Data
post_harvest_data <- prices.monthly[prices.monthly$Season == "Post-Harvest", ]
# Remove rows with NaNs from the Post-Harvest data
post_harvest_data <- post_harvest_data[complete.cases(post_harvest_data[, 7:14]), ]

# Lean Season data
lean_season_data <- prices.monthly[prices.monthly$Season == "Lean Season", ]
# Remove rows with NaNs from the Lean Season data
lean_season_data <- lean_season_data[complete.cases(lean_season_data[, 7:14]), ]
# Calculate correlation matrix for Post-Harvest season
post_harvest_corr <- cor(post_harvest_data[, 7:14])
# Calculate correlation matrix for Lean Season
lean_season_corr <- cor(lean_season_data[, 7:14])
# Plot correlation matrix for Post-Harvest season
corrplot(post_harvest_corr, 
         method = "color",          
         title = "",                
         tl.col = "black",          
         tl.cex = 0.5,             
         addCoef.col = "black",     
         number.cex = 0.5,         
         number.digits = 2) 

# Add title to the plot
title(main = "Post-Harvest Correlation Matrix", 
      line = 3,                
      cex.main = 0.9)

# Plot correlation matrix for Lean Season
corrplot(lean_season_corr, 
         method = "color", 
         title = "",
         tl.col = "black",       
         tl.cex = 0.5, 
         addCoef.col = "black",
         number.cex = 0.5,
         number.digits = 2)

# Add title to the plot
title(main = "Lean Season Correlation Matrix", 
      line = 3,                
      cex.main = 0.9) 

Comparing Pooled Vs Crop Specific Price Predictions Using different validation methods

We are comparing pooled vs Crop Specific Price Prediction models to determine which model performs better at prediction. This is done by comparing their prediction’s r2 or rmse respectively.

Function to evaluate models for each crop using 2024 Data as the validation data

Here we are using 2024 data for validation.

# Comparison between Pooled model and Crop Specific Model
set.seed(1983)
evaluate_models <- function(crop) {
  # Filter data for the specific crop
  crop_data <- mypts[mypts$Crop == crop, ]
  training_data_crop <- crop_data[crop_data$Year %in% c(2021, 2022, 2023), ]
  training_data_crop <- as.data.frame(training_data_crop)
  validation_data_crop <- crop_data[crop_data$Year == 2024, ]
  validation_data_crop <- as.data.frame(validation_data_crop)
  
  # Predictions for pooled model on specific crop data
  pooled_pred_crop <- predict(rf, newdata = validation_data_crop)
  actual_pooled <- validation_data_crop$pkg
  result_pooled <-data.frame(actual=actual_pooled, predicted=pooled_pred_crop)
  
  # Crop-Specific RF Model
  rf_crop <- randomForest(pkg ~ Month + Year + ttport_1 + popdens + bio_3 + bio_6 + bio_9 + bio_12 + bio_18 + rain.sum.lag,
                          data = training_data_crop, na.rm = TRUE)
  
  # Predictions for crop-specific model
  predictions_crop <- predict(rf_crop, newdata = validation_data_crop)
  actual_crop <- validation_data_crop$pkg
  result_crop_specific <- data.frame(actual=actual_crop, predicted=predictions_crop)
  
  # Calculate performance metrics
  rmse_pooled <- round(sqrt(mean((result_pooled$actual-result_pooled$predicted)^2 , na.rm = TRUE )),2)
  r2_pooled <- round(summary(lm(actual~predicted, result_pooled))$r.squared,2)
  rmse_crop <- round(sqrt(mean((result_crop_specific$actual-result_crop_specific$predicted)^2 , na.rm = TRUE )),2)
  r2_crop <- round(summary(lm(actual~predicted, result_crop_specific))$r.squared,2)
  
  return(data.frame(Crop = crop,
                    Model = c("Pooled", "Crop-Specific"),
                    RMSE = c(rmse_pooled, rmse_crop),
                    R_squared = c(r2_pooled, r2_crop)))
}
# Apply the function to all crops
crop <- unique(mypts$Crop)
comparison_df <- do.call(rbind, lapply(crop, evaluate_models))
comparison_df
##        Crop         Model   RMSE R_squared
## 1     Maize        Pooled 335.05      0.34
## 2     Maize Crop-Specific 339.85      0.33
## 3      Rice        Pooled 557.04      0.32
## 4      Rice Crop-Specific 567.37      0.28
## 5   Sorghum        Pooled 318.42      0.53
## 6   Sorghum Crop-Specific 321.36      0.51
## 7  B.Millet        Pooled 432.54      0.40
## 8  B.Millet Crop-Specific 432.56      0.40
## 9  F.Millet        Pooled 312.51      0.34
## 10 F.Millet Crop-Specific 312.73      0.34
## 11    Wheat        Pooled 482.15      0.42
## 12    Wheat Crop-Specific 493.01      0.38
## 13    Beans        Pooled 352.01      0.35
## 14    Beans Crop-Specific 364.02      0.34
## 15   Potato        Pooled 320.01      0.00
## 16   Potato Crop-Specific 327.89      0.00
#write.csv(comparison_df, "model_comparison_df.csv")
#comparison_df <- read.csv("model_comparison_df.csv")
comparison_df_wide <- comparison_df %>%
  pivot_wider(names_from = Model, values_from = c(RMSE, R_squared)) %>%
  rename(
    RMSE_Crop_Specific = `RMSE_Crop-Specific`,
    RMSE_Pooled = `RMSE_Pooled`,
    R_squared_Crop_Specific = `R_squared_Crop-Specific`,
    R_squared_Pooled = `R_squared_Pooled`
  ) %>%
  group_by(Crop) %>%
  summarize(
    RMSE_Pooled = max(RMSE_Pooled, na.rm = TRUE),
    RMSE_Crop_Specific = max(RMSE_Crop_Specific, na.rm = TRUE),
    R_squared_Pooled = max(R_squared_Pooled, na.rm = TRUE),
    R_squared_Crop_Specific = max(R_squared_Crop_Specific, na.rm = TRUE)
  )
knitr::kable(comparison_df_wide, caption = "Model Comparison: R² and RMSE")
Model Comparison: R² and RMSE
Crop RMSE_Pooled RMSE_Crop_Specific R_squared_Pooled R_squared_Crop_Specific
Maize 335.05 339.85 0.34 0.33
Rice 557.04 567.37 0.32 0.28
Sorghum 318.42 321.36 0.53 0.51
B.Millet 432.54 432.56 0.40 0.40
F.Millet 312.51 312.73 0.34 0.34
Wheat 482.15 493.01 0.42 0.38
Beans 352.01 364.02 0.35 0.34
Potato 320.01 327.89 0.00 0.00

“train-test split” validation

Here the dataset is split once (70:30). The model is trained on one part of the data (70%) and evaluated on the remaining set of the data (30%).

# Pooled RF model
##Split the data into train and test datasets
set.seed(1983)
rows <- sample(x=1:nrow(mypts), size = 0.70* nrow(mypts))
train <- mypts[rows, ]
test <- mypts[! rownames(mypts) %in% rows, ]

RF <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans +
                     Month +
                     Year + 
                     ttport_1 +
                     popdens +
                     bio_3 + bio_6 + bio_9 +  bio_12 + bio_18 + 
                     rain.sum.lag, 
                   data=train, na.rm=TRUE)

RF
## 
## Call:
##  randomForest(formula = pkg ~ maize + rice + sorghum + bmillet +      fmillet + wheat + beans + Month + Year + ttport_1 + popdens +      bio_3 + bio_6 + bio_9 + bio_12 + bio_18 + rain.sum.lag, data = train,      na.rm = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 47782.6
##                     % Var explained: 91.02
evaluate_models <- function(crop) {
  # Filter data for the specific crop
  crop_data <- mypts[mypts$Crop == crop, ]
  set.seed(1983)
  rownames(crop_data)<-1:nrow(crop_data)
  rows <- sample(x=1:nrow(crop_data), size = 0.70* nrow(crop_data))
  
  training_data_crop <- crop_data[rows, ]

  validation_data_crop <- crop_data[! rownames(crop_data) %in% rows, ]
  
  # Predictions for pooled model on specific crop data
  pooled_pred_crop <- predict(RF, newdata = validation_data_crop)
  actual_pooled <- validation_data_crop$pkg
  result_pooled <-data.frame(actual=actual_pooled, predicted=pooled_pred_crop)
  
  # Crop-Specific Model
  rf_crop <- randomForest(pkg ~ Month + Year + ttport_1 + popdens + bio_3 + bio_6 + bio_9 + bio_12 + bio_18 + rain.sum.lag,
                          data = training_data_crop, na.rm = TRUE)
  
  # Predictions for crop-specific model
  predictions_crop <- predict(rf_crop, newdata = validation_data_crop)
  actual_crop <- validation_data_crop$pkg
  result_crop_specific <- data.frame(actual=actual_crop, predicted=predictions_crop)
  
  # Calculate performance metrics
  rmse_pooled <- round(sqrt(mean((result_pooled$actual-result_pooled$predicted)^2 , na.rm = TRUE )),2)
  r2_pooled <- round(summary(lm(actual~predicted, result_pooled))$r.squared,2)
  rmse_crop <- round(sqrt(mean((result_crop_specific$actual-result_crop_specific$predicted)^2 , na.rm = TRUE )),2)
  r2_crop <- round(summary(lm(actual~predicted, result_crop_specific))$r.squared,2)
  
  return(data.frame(Crop = crop,
                    Model = c("Pooled", "Crop-Specific"),
                    RMSE = c(rmse_pooled, rmse_crop),
                    R_squared = c(r2_pooled, r2_crop)))
}
# Apply function to all crops
crop <- unique(mypts$Crop)
comparison_df3 <- do.call(rbind, lapply(crop, evaluate_models))

comparison_df3
##        Crop         Model   RMSE R_squared
## 1     Maize        Pooled  95.80      0.89
## 2     Maize Crop-Specific 113.78      0.83
## 3      Rice        Pooled 151.50      0.94
## 4      Rice Crop-Specific 182.37      0.90
## 5   Sorghum        Pooled 173.56      0.87
## 6   Sorghum Crop-Specific 209.83      0.77
## 7  B.Millet        Pooled 164.25      0.89
## 8  B.Millet Crop-Specific 224.89      0.77
## 9  F.Millet        Pooled 155.67      0.80
## 10 F.Millet Crop-Specific 208.40      0.62
## 11    Wheat        Pooled 162.06      0.92
## 12    Wheat Crop-Specific 231.81      0.83
## 13    Beans        Pooled 173.85      0.92
## 14    Beans Crop-Specific 212.28      0.87
## 15   Potato        Pooled 170.21      0.69
## 16   Potato Crop-Specific 137.18      0.73
comparison_df_wide3 <- comparison_df3 %>%
  pivot_wider(names_from = Model, values_from = c(RMSE, R_squared)) %>%
  rename(RMSE_Crop_Specific = `RMSE_Crop-Specific`,
         RMSE_Pooled = `RMSE_Pooled`,
         R_squared_Crop_Specific = `R_squared_Crop-Specific`,
         R_squared_Pooled = `R_squared_Pooled`)
knitr::kable(comparison_df_wide3, caption = "Model Comparison: R² and RMSE Based on Train-Test Split Validation")
Model Comparison: R² and RMSE Based on Train-Test Split Validation
Crop RMSE_Pooled RMSE_Crop_Specific R_squared_Pooled R_squared_Crop_Specific
Maize 95.80 113.78 0.89 0.83
Rice 151.50 182.37 0.94 0.90
Sorghum 173.56 209.83 0.87 0.77
B.Millet 164.25 224.89 0.89 0.77
F.Millet 155.67 208.40 0.80 0.62
Wheat 162.06 231.81 0.92 0.83
Beans 173.85 212.28 0.92 0.87
Potato 170.21 137.18 0.69 0.73

Leave-One-Market-Out Cross-Validation

# Predicting For markets
mypts <- as.data.frame(mypts)


# Initialize a list to store results for each market
results <- list()

# Get unique markets
markets <- unique(mypts$Market)

# Loop over each market
for (market in markets) {
  
  # Split data: exclude current market for training, and use current market for validation
  train_mkts <- mypts %>% filter(Market != market)
  valid_mkt <- mypts %>% filter(Market == market)
  
  # Train Random Forest model on the training data
  rf_model <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + 
                             jan + feb + mar + apr + may + jun + jul + aug + sep + oct + nov + dec +
                             Year + ttport_1 + popdens + bio_3 + bio_6 + bio_9 + bio_12 + bio_18 + 
                             rain.sum.lag, 
                           data = train_mkts, na.rm = TRUE)
  
  # Predict on the validation set for the current market
  predictions <- predict(rf_model, newdata = valid_mkt)
  actual <- valid_mkt$pkg
  
  # Calculate RMSE and R-squared for the current market
  rmse <- round(sqrt(mean((actual - predictions)^2, na.rm = TRUE)), 2)
  r2 <- round(summary(lm(actual ~ predictions))$r.squared, 2)
  
  # Store the results in a data frame for the current market
  results[[market]] <- data.frame(Market = market, RMSE = rmse, R_squared = r2)
}

# write.csv(results_df, "Results_R2_RMSE.csv")
results_df <- do.call(rbind, results)
knitr::kable(results_df, caption = "Leave-One-Market-Out Cross-Validation")
Leave-One-Market-Out Cross-Validation
Market RMSE R_squared
Arusha Arusha 219.34 0.92
Temeke Temeke 214.50 0.93
Majengo Majengo 348.17 0.85
Kibaigwa Kibaigwa 272.77 0.94
Bukoba Bukoba 318.05 0.78
Babati Babati 233.73 0.87
Sumbawanga Sumbawanga 433.17 0.76
Mpanda Mpanda 396.63 0.76
Mtwara Mtwara 482.25 0.68
Tabora Tabora 217.31 0.90
Tanga Tanga 295.37 0.81
Kinondoni Kinondoni 196.30 0.93
Ilala Ilala 247.43 0.93
Iringa Iringa 388.58 0.74
Kigoma Kigoma 316.41 0.78
Morogoro Morogoro 260.77 0.86
Mwanza Mwanza 307.11 0.81
Musoma Musoma 420.12 0.55
Songea Songea 422.90 0.83
Shinyanga Shinyanga 389.33 0.74
Moshi Moshi 363.48 0.84
Mbeya Mbeya 307.84 0.85
Njombe Njombe 352.91 0.84
Lindi Lindi 727.47 0.57
Manyara Manyara 327.49 0.79
Tandika Tandika 216.96 0.95
Buguruni Buguruni 154.29 0.99
Tandale Tandale 218.89 0.95
Singida Singida 219.75 0.97
Pwani Pwani 349.67 0.75
Bariadi Bariadi 248.45 0.85
Mpimbwe Mpimbwe 457.23 0.77
Nyankumbu Nyankumbu 428.87 0.71
Songwe Songwe 500.93 0.50
Mwananyamala Mwananyamala 226.61 0.93
Namfua Namfua 168.57 0.95
Kilombero Kilombero 203.79 0.92
Mgandini Mgandini 356.93 0.62
majengo majengo 245.12 0.92
Igawilo Igawilo 291.08 0.83
Mnalani Mnalani 139.41 0.95
Ubungo Ubungo 285.63 0.97
# Calculate the mean and standard deviation for RMSE and R-squared
mean_rmse <- mean(results_df$RMSE)
sd_rmse <- sd(results_df$RMSE)
mean_r2 <- mean(results_df$R_squared)
sd_r2 <- sd(results_df$R_squared)
summary_table <- data.frame(
  Metric = c("RMSE", "R-squared"),
  Mean = c(mean_rmse, mean_r2),
  SD = c(sd_rmse, sd_r2)
)

knitr::kable(summary_table, caption = "mean and standard deviation for RMSE and R-squared of LOMO-CV")
mean and standard deviation for RMSE and R-squared of LOMO-CV
Metric Mean SD
RMSE 313.6097619 113.0821921
R-squared 0.8283333 0.1196319
results_df <- read.csv("Results_R2_RMSE.csv")
# Convert to spatial data
cods_vect <- terra::vect(results_df, geom = c("Longitude", "Latitude"), crs = crs(tza1), keepgeom=FALSE)

cods_sf <- sf::st_as_sf(cods_vect)

tza1_sf <- sf::st_as_sf(tza1)

# Plot R^2 values on the Tanzania map
ggplot(data = tza1_sf) +
  # Base map with Tanzania's boundaries
  geom_sf(fill = "white", color = "gray") +
  
  # Add market-level points colored and sized by R-squared values
  geom_sf(data = cods_sf, aes(size = R_squared, color = R_squared), alpha = 0.8) +
  
  scale_color_viridis_c(option = "C", name = "R-squared") +
  
  scale_size_continuous(range = c(2, 10), name = "R-squared") +
  
  guides(
    size = guide_legend(order = 1),   
    color = guide_legend(order = 1)   
  ) +
  
  labs(title = "Market-Level R-squared Values",
       x = "Longitude", y = "Latitude") +
  
  theme_minimal()

Extracting Predicted prices to OSM Point DATA

The OSM dataset used was obtained from Tanzania Populated Places (OpenStreetMap Export) - https://data.humdata.org/dataset/hotosm_tza_populated_places?

# Extract Predicted price using OSM Data
# Bring in Tanzania Populated Places from OSM
TZ_populated_places <- vect("H:\\Tanzania Price data\\Datasets\\OSM_Population\\hotosm_tza_populated_places_points_shp.shp")
# head(TZ_populated_places)
# Plot All data points
plot(tza1)
plot(TZ_populated_places, col="Red", add=TRUE)

sapply(TZ_populated_places, class)
##        name     name_en       place  population       is_in      source 
## "character" "character" "character" "character" "character" "character" 
##     name_sw      osm_id    osm_type 
## "character"   "integer" "character"
# Ensure that the population column is numeric
TZ_populated_places$population <- as.numeric(TZ_populated_places$population)
# Filter to remain with rows where population is 20,000 or more
TZ_populated_places_filtered <- TZ_populated_places_filtered <- TZ_populated_places[!is.na(TZ_populated_places$population) & TZ_populated_places$population >= 20000, ]
# Plot places where population data is provided
plot(tza1)
plot(TZ_populated_places_filtered, col="Red", add=TRUE)

# Read the predicted price raster data for Maize
output_dir_Maize <- "H:/Tanzania Price data/Datasets/Pred-plots/Maize"

Maize_price_list <- list.files(output_dir_Maize, pattern = ".tif$", full.names = TRUE)
Maize_price_list
##  [1] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_01.tif"
##  [2] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_02.tif"
##  [3] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_03.tif"
##  [4] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_04.tif"
##  [5] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_05.tif"
##  [6] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_06.tif"
##  [7] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_07.tif"
##  [8] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_08.tif"
##  [9] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_09.tif"
## [10] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_10.tif"
Maize_price_rast <- rast(Maize_price_list)
# Extract predicted maize prices for OSM points
osm_points_with_prices <- TZ_populated_places_filtered  

# Loop over each month and extract the maize price for OSM points
for (i in 1:nlyr(Maize_price_rast)) {
  # Extract prices for month i
  prices_for_month <- terra::extract(Maize_price_rast[[i]], TZ_populated_places_filtered, method = "bilinear")
  
  # Add the extracted prices to the SpatVector (each month as a separate column)
  osm_points_with_prices[[paste0("maize_price_month_", sprintf("%02d", i))]] <- prices_for_month[,2]
}
# Filter to remain only with necessary columns
osm_points_with_prices_select <- osm_points_with_prices[, c("name",
"place", "is_in", "population", "osm_id", "maize_price_month_01", "maize_price_month_02", "maize_price_month_03",  "maize_price_month_04", 
"maize_price_month_05", "maize_price_month_06", "maize_price_month_07", 
"maize_price_month_08")]
osm_points_with_prices_select <- as.data.frame(osm_points_with_prices_select)

# Reshape to long format with a Month and price columns
osm_points_long <- osm_points_with_prices_select %>%
  pivot_longer(cols = starts_with("maize_price_month_"), 
               names_to = "Month", 
               values_to = "price")
# Clean month column to extract only the month number
osm_points_long$Month <- as.numeric(gsub("maize_price_month_", "", osm_points_long$Month))

# Add a new column crop with Maize 
osm_points_long$crop <- "maize"

osm_points_long <- as.data.table(osm_points_long)
ppts <- head(osm_points_long, 50)
knitr::kable(ppts, caption = "Extracted Maize Prices For Different Markets in Tanzania For the Year 2024")
Extracted Maize Prices For Different Markets in Tanzania For the Year 2024
name place is_in population osm_id Month price crop
Geita city NA 318006 258307668 1 1071.4069 maize
Geita city NA 318006 258307668 2 1040.3415 maize
Geita city NA 318006 258307668 3 1032.7237 maize
Geita city NA 318006 258307668 4 968.8754 maize
Geita city NA 318006 258307668 5 978.2965 maize
Geita city NA 318006 258307668 6 995.3333 maize
Geita city NA 318006 258307668 7 994.3241 maize
Geita city NA 318006 258307668 8 984.8319 maize
Mafinga town Iringa, Tanzania 99305 258505599 1 954.2232 maize
Mafinga town Iringa, Tanzania 99305 258505599 2 916.2251 maize
Mafinga town Iringa, Tanzania 99305 258505599 3 895.5600 maize
Mafinga town Iringa, Tanzania 99305 258505599 4 841.1950 maize
Mafinga town Iringa, Tanzania 99305 258505599 5 810.0676 maize
Mafinga town Iringa, Tanzania 99305 258505599 6 807.7249 maize
Mafinga town Iringa, Tanzania 99305 258505599 7 800.8484 maize
Mafinga town Iringa, Tanzania 99305 258505599 8 802.6259 maize
Masumbwe town Shinyanga, Tanzania 50000 259713894 1 1097.4390 maize
Masumbwe town Shinyanga, Tanzania 50000 259713894 2 1077.4080 maize
Masumbwe town Shinyanga, Tanzania 50000 259713894 3 1058.0266 maize
Masumbwe town Shinyanga, Tanzania 50000 259713894 4 954.7785 maize
Masumbwe town Shinyanga, Tanzania 50000 259713894 5 952.6296 maize
Masumbwe town Shinyanga, Tanzania 50000 259713894 6 973.3632 maize
Masumbwe town Shinyanga, Tanzania 50000 259713894 7 977.1048 maize
Masumbwe town Shinyanga, Tanzania 50000 259713894 8 978.0525 maize
Vwawa town Mbeya, Tanzania 85000 262107786 1 964.7503 maize
Vwawa town Mbeya, Tanzania 85000 262107786 2 951.0346 maize
Vwawa town Mbeya, Tanzania 85000 262107786 3 942.9071 maize
Vwawa town Mbeya, Tanzania 85000 262107786 4 913.3199 maize
Vwawa town Mbeya, Tanzania 85000 262107786 5 872.9376 maize
Vwawa town Mbeya, Tanzania 85000 262107786 6 849.7830 maize
Vwawa town Mbeya, Tanzania 85000 262107786 7 818.5362 maize
Vwawa town Mbeya, Tanzania 85000 262107786 8 828.0533 maize
Chake-Chake town NA 52047 622514835 1 1080.6272 maize
Chake-Chake town NA 52047 622514835 2 1089.0785 maize
Chake-Chake town NA 52047 622514835 3 1068.1354 maize
Chake-Chake town NA 52047 622514835 4 1026.9848 maize
Chake-Chake town NA 52047 622514835 5 1084.0845 maize
Chake-Chake town NA 52047 622514835 6 1081.5926 maize
Chake-Chake town NA 52047 622514835 7 1079.0308 maize
Chake-Chake town NA 52047 622514835 8 1055.8905 maize
Babati town NA 67445 -1047512940 1 953.8866 maize
Babati town NA 67445 -1047512940 2 958.2093 maize
Babati town NA 67445 -1047512940 3 959.7328 maize
Babati town NA 67445 -1047512940 4 939.8418 maize
Babati town NA 67445 -1047512940 5 915.3608 maize
Babati town NA 67445 -1047512940 6 915.8868 maize
Babati town NA 67445 -1047512940 7 915.1750 maize
Babati town NA 67445 -1047512940 8 913.8773 maize
Vikindu town NA 70000 1826809646 1 1093.1816 maize
Vikindu town NA 70000 1826809646 2 1083.0636 maize

Market access

Slope

# Slope from geodata
# Extract slope from srtm using terrain function in terra package 
# tza_alt <- elevation_30s("Tanzania", path=".")
# Obtain the slope in radians
# slope_radian <- terrain(tza_alt, "slope", unit="radians", neighbors=8)
# terra::plot(slope_radian, main = "Slope of Tanzania (Radians)")

# Save slope raster
# output_dir <- "H:/Tanzania Price data/Datasets/Cost_surface data"
# output_file <- paste0(output_dir, "/tz_slope_radian.tif")
#writeRaster(slope_radian, output_file, overwrite = TRUE)

# Set Lambert Azimuthal Equal-Area (LAEA) projection centered on Tanzania
laea_tz <- "+proj=laea +lat_0=-6 +lon_0=35 +datum=WGS84 +units=m +no_defs"

# Load and reproject slope layer
slope_radian <- terra::rast("H:/Tanzania Price data/Datasets/Cost_surface data/tz_slope_radian.tif")
# Reproject to LAEA and set resolution to 500m
slope_radian_laea <- terra::project(slope_radian, laea_tz, res = 500, 
                                    filename = "slope_laea_500m.tif", overwrite = TRUE)
terra::plot(slope_radian_laea, main = "Slope of Tanzania (Radians)")

# Use the slope layer obtained above to create a decay coefficient
# We use a decay coefficient of 1.5
# Create slope cost surface
decay <- 1.5
slope_cost <- exp(decay * tan(slope_radian_laea))
names(slope_cost) <- "slope_cost"
terra::plot(slope_cost, main = "Slope cost")

Roads

# Roads Data from OSM
# Obtain roads data from osmdata. Code is in reagro.
# roads <- geodata::osm("Tanzania", "highways", ".")
# writeVector(roads, "H:/Tanzania Price data/Datasets/Cost_surface data/tz_roads.shp", overwrite = TRUE)

# Load and reproject roads
roads <- terra::vect("H:/Tanzania Price data/Datasets/Cost_surface data/tz_roads.shp")
# Reproject the roads vector to LAEA
roads_laea <- terra::project(roads, laea_tz)

terra::plot(slope_radian_laea)
terra::lines(roads_laea, col="black")
terra::lines(roads_laea[roads_laea$highway == "primary", ], lwd=4, col="red")
terra::lines(roads_laea[roads_laea$highway == "secondary", ], lwd=2, col="blue")

creating roads cost surface

# Create road cost surface
# Create road cost surface
cfile <- "rdcost_laea.tif"
roadtypes <- c("primary", "secondary", "tertiary")

if (!file.exists(cfile)) {
  i <- match(roads_laea$highway, roadtypes)
  roads_laea$speed <- c(0.001, 0.0015, 0.002)[i]
  rd_cost <- rasterize(roads_laea, slope_radian, field=roads_laea$speed, filename=cfile, overwrite=TRUE)
} else {
  rd_cost <- terra::rast(cfile)
}


rd_cost_laea <- terra::project(rd_cost, laea_tz,  res = 500, 
                               filename = "rd_cost_laea_500m.tif", overwrite = TRUE)

a <- aggregate(rd_cost_laea, 3, min, na.rm=TRUE)
terra::plot(a, col=c("black", "blue", "red"), main = "Road travel cost (min/m)")

Land Cover

# Load land cover layer
tza_lulc <- terra::rast("H:/Tanzania Price data/Datasets/Cost_surface data/TZ_Land_cover_2021.tif")

plot(tza_lulc, main = "WorldCover 10 m 2021 v200 - landcover classes")
terra::lines(tza1) 

Table_class <- data.frame(
  Value = c(10,20,30,40,50,60,70,80,90, 95),
  LandClass = c("Tree cover",
                "Shrubland",
                "Grassland",
                "Cropland",
                "Built-up",
                "Bare/sparse vegetation",
                "Snow and Ice",
                "Permanent water bodies",
                "Herbaceous wetland",
                "Mangroves"),
  travel_speeds=c(0.04, 0.02, 0.02, 0.01, 0.01, 0.02, 0.04, 0.11, 0.04, 0.05)
)
knitr::kable(Table_class, caption = "Land Cover classes (ESA WorldCover 10 m 2021 v200)")
Land Cover classes (ESA WorldCover 10 m 2021 v200)
Value LandClass travel_speeds
10 Tree cover 0.04
20 Shrubland 0.02
30 Grassland 0.02
40 Cropland 0.01
50 Built-up 0.01
60 Bare/sparse vegetation 0.02
70 Snow and Ice 0.04
80 Permanent water bodies 0.11
90 Herbaceous wetland 0.04
95 Mangroves 0.05
rc <- data.frame(from=unique(tza_lulc)[,1], to=0.02)
rc <- data.frame(from=unique(tza_lulc)[,1], to=0.02)
# Adjust Reclassification Table Based on the Classes
#rc <- data.frame(from = Table_class$Value, to = Table_class$travel_speeds)
# Assign specific travel speeds based on land cover class values
rc$to[rc$from %in% c(10)] <- 0.04   # Tree cover
rc$to[rc$from %in% c(20, 30)] <- 0.02  # Shrubland and Grassland
rc$to[rc$from == 40] <- 0.01  # Cropland
rc$to[rc$from == 50] <- 0.01  # Built-up
rc$to[rc$from == 60] <- 0.02  # Bare/sparse vegetation
rc$to[rc$from == 70] <- 0.04  # Snow and Ice 
rc$to[rc$from == 80] <- 0.11  # Permanent water bodies
rc$to[rc$from == 90] <- 0.04  # Herbaceous wetland
rc$to[rc$from == 95] <- 0.05  # Mangroves
rc
##    from   to
## 1    10 0.04
## 2    20 0.02
## 3    30 0.02
## 4    40 0.01
## 5    50 0.01
## 6    60 0.02
## 7    70 0.04
## 8    80 0.11
## 9    90 0.04
## 10   95 0.05
#reclassifying
tza_lc_reclass <- classify(tza_lulc, rc)
## 
|---------|---------|---------|---------|
=========================================
                                          
lcfname <- "lc_cost.tif"
if (!file.exists(lcfname)) {
  # first aggregate to about the same spatial resolution
  lc_cost <- aggregate(tza_lc_reclass, 3, mean)
  # then resample
  lc_cost <- resample(lc_cost, slope_radian, filename=lcfname, wopt=list(names="lc_cost"), overwrite=TRUE)
} else {
  lc_cost <- rast(lcfname)
}


lc_cost_laea <- terra::project(lc_cost, laea_tz,  res = 500, 
                               filename = "lc_cost_laea_500m.tif", overwrite = TRUE)
terra::plot(lc_cost_laea, main = "Off-road travel costs (min/m) based on land cover class")

# Resample to harmonize the resolution
#lc_cost_resampled <- terra::resample(lc_cost, slope_cost)
#names(lc_cost_resampled) <- "lc_cost"
#rd_cost_resample <- terra::resample(rd_cost, slope_cost)
#names(rd_cost_resample) <- "rd_cost"
# Combine the cost layers
all_cost <- c(rd_cost_laea, lc_cost_laea)
#getting the minimum value of each grid cell
cost <- min(all_cost, na.rm=TRUE)
cost <- cost * slope_cost
terra::plot(cost, main="Final cost layer (min/m)")

# Combine the cost layers
library(gdistance)
cost <- raster(cost)
conductance <- 1/cost
#Creating a transition object
tran <- transition(conductance, transitionFunction=mean, directions= 8)

tran <- geoCorrection(tran, type="c")

save(trans, file = "H:/Tanzania Price data/Datasets/Cost_surface data/transition.rda")
# Towns of 20,000 people or more from OSM
town <- terra::vect("H:/Tanzania Price data/Datasets/Cost_surface data/tz_towns.shp")
# Project the towns to match the CRS of slope_radian
town_prj <- project(town, crs(slope_cost))
# geom(town_prj)
# Extracting coordinates and population from the projected towns
towns <- data.frame(
  x = geom(town_prj)[, 3],  
  y = geom(town_prj)[, 4],  
  population = town_prj$population  # Population from the projected towns
)
# towns
#convert to spatial points needed in gdistance
spTowns <- SpatialPoints(cbind(towns$x, towns$y))
spTowns
## class       : SpatialPoints 
## features    : 94 
## extent      : -584189.9, 567514.8, -558136.2, 532149.3  (xmin, xmax, ymin, ymax)
## crs         : NA

Estimating Access to Markets

#Estimating
Ac <- accCost(tran, fromCoords=spTowns)
A <- rast(Ac) / 60
AA <- clamp(A, 0, 24) |> mask(slope_radian_laea)
## Warning: [mask] CRS do not match
terra::plot(AA, main="Access to markets (towns > 20k) in Tanzania (hrs)")
terra::lines(roads_laea)
terra::points(town_prj, col="red", pch=20, cex=1.0)

Estimating farmgate prices for Tanzania”

#Load trans object that was created
load("H:/Tanzania Price data/Datasets/Cost_surface data/transition.rda")
terra::plot(raster(tran), main='Transition matrix (minutes)')
terra::plot(town_prj, col = "red", cex = 0.5, add=TRUE)

# We'll use towns our >20k from OSM
town1 <- terra::vect("H:/Tanzania Price data/Datasets/Cost_surface data/tz_towns.shp")

Calculating farmgate prices

# Extract predicted maize prices for OSM points
# Lets use January Maize price for now
maize_price_jan <- rast("H:\\Tanzania Price data\\Datasets\\Pred-plots\\Maize\\maize_price_rf_pred_01.tif")
maize_price_jan_laea <- terra::project(maize_price_jan, laea_tz, res = 500)
maize_price_jan_osm <- terra::extract(maize_price_jan_laea, town_prj, fun=mean, buffer=2500, small=TRUE)
# the extract function is returning some NA values, complete the list with a mean of the other values
price_values <- maize_price_jan_osm[, 2]  
# Replace NAs with the mean of the non-NA values
maize_price_jan_osm[, 2][is.na(price_values)] <- mean(price_values, na.rm = TRUE)
head(maize_price_jan_osm)
##   ID      lyr1
## 1  1 1071.4312
## 2  2  954.1059
## 3  3 1096.9810
## 4  4  964.8270
## 5  5 1079.5992
## 6  6  954.1497
library(sp)
town_prj[["maipkg"]] <- maize_price_jan_osm$lyr1
town_sp <- as(town_prj, "Spatial")

Calculate

#transportation cost. I converted 0.02 usd/kg/h to TZS/kg/hr = 54.43 
tr_cost <- 54.43 #TZS/kg/hr

# Folder to store temporary rasters
temp_dir <- "H:/Tanzania Price data/Datasets/Cost_surface data/Temp_fgate_rasters/"

if (!dir.exists(temp_dir)) {
  dir.create(temp_dir, recursive = TRUE)
}

# Loop to calculate and save each farmgate price raster
for (x in seq_along(town_sp)) {
  tmp.acc <- accCost(tran, town_sp[x,]) / 60
  market_price <- as.data.frame(town_sp)[x, "maipkg"]

  # Calculate farmgate price
  fgate_price <- market_price - (54.43 * tmp.acc)
  fgate_price[fgate_price < 0] <- 0

  # Save each raster to disk with a unique filename
  save_path <- paste0(temp_dir, "fgate_price_", x, ".tif")
  terra::writeRaster(fgate_price, save_path, overwrite = TRUE)
}

# Read all saved rasters back into a list
raster_files <- list.files(temp_dir, pattern = "\\.tif$", full.names = TRUE)
fgate_rasters <- lapply(raster_files, terra::rast)

# Combine all rasters into a single SpatRaster stack
mystack_spat <- do.call(c, fgate_rasters)
fgate_price <- max(mystack_spat) %>% resample(.,maize_price_jan_laea)  %>% mask(.,maize_price_jan_laea)
## 
|---------|---------|---------|---------|
=========================================
                                          
maize_price_jan_laea; fgate_price
## class       : SpatRaster 
## dimensions  : 2669, 2669, 1  (nrow, ncol, nlyr)
## resolution  : 500, 500  (x, y)
## extent      : -668509.2, 665990.8, -671388, 663112  (xmin, xmax, ymin, ymax)
## coord. ref. : +proj=laea +lat_0=-6 +lon_0=35 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs 
## source(s)   : memory
## name        :      lyr1 
## min value   :  864.5159 
## max value   : 1364.5247
## class       : SpatRaster 
## dimensions  : 2669, 2669, 1  (nrow, ncol, nlyr)
## resolution  : 500, 500  (x, y)
## extent      : -668509.2, 665990.8, -671388, 663112  (xmin, xmax, ymin, ymax)
## coord. ref. : +proj=laea +lat_0=-6 +lon_0=35 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs 
## source(s)   : memory
## name        :      max 
## min value   :    0.000 
## max value   : 1190.293
terra::compareGeom(maize_price_jan_laea, fgate_price)
## [1] TRUE
names(fgate_price) <- "Farm_Gate_Price"

plot(fgate_price, main="Farm gate prices Jan 2024")

names(maize_price_jan_laea) <- "Pred_Maize_price_jan"
maize_values_jan <- terra::values(maize_price_jan_laea, na.rm = TRUE)
# Get minimum and maximum values
min_maize_jan <- min(maize_values_jan, na.rm = TRUE)
max_maize_jan <- max(maize_values_jan, na.rm = TRUE)
# Define the break interval for plotting
break_interval <- 100
breaks_seq <- seq(min_maize_jan, max_maize_jan, by = break_interval)
color_palette <- rev(terrain.colors(100))
# Plot 
terra::plot(
  maize_price_jan_laea, 
  main = "Predicted Maize Price Jan 2024", 
  zlim = c(min_maize_jan, max_maize_jan), 
  col = color_palette, 
  breaks = breaks_seq
)

# Set up the layout for 2 plots side by side
par(mfrow = c(1, 2), mar = c(4, 4, 2, 1)) 

# Plot Maize Price
plot(
  maize_price_jan_laea,
  main = "Predicted Maize Price Jan 2024",
  zlim = c(min_maize_jan, max_maize_jan),
  col = color_palette,
  breaks = breaks_seq
)

# Plot Farm Gate Price
plot(fgate_price, main="Farm gate prices Jan 2024")